Hey there, I have used seqio to get a well distributed mixture of samples from multiple dataset. However the resultant output from seqio is a python generator dict, which I cannot produce back into huggingface dataset.
The generator contains all the samples needed for training the model but I cannot convert it into a huggingface dataset.
for ex in seqio_data:
print(ex[“text”])
I need to convert the seqio_data (generator) into huggingface dataset.
import functools
import seqio
import tensorflow as tf
import t5.data
from datasets import load_dataset
from t5.data import postprocessors
from t5.data import preprocessors
from t5.evaluation import metrics
from seqio import FunctionDataSource, utils
TaskRegistry = seqio.TaskRegistry
def gen_dataset(split, shuffle=False, seed=None, column="text", dataset_params=None):
dataset = load_dataset(**dataset_params)
if shuffle:
if seed:
dataset = dataset.shuffle(seed=seed)
else:
dataset = dataset.shuffle()
while True:
for item in dataset[str(split)]:
yield item[column]
def dataset_fn(split, shuffle_files, seed=None, dataset_params=None):
return tf.data.Dataset.from_generator(
functools.partial(gen_dataset, split, shuffle_files, seed, dataset_params=dataset_params),
output_signature=tf.TensorSpec(shape=(), dtype=tf.string, name=dataset_name)
)
@utils.map_over_dataset
def target_to_key(x, key_map, target_key):
"""Assign the value from the dataset to target_key in key_map"""
return {**key_map, target_key: x}
dataset_name = 'oscar-corpus/OSCAR-2109'
subset= 'mr'
dataset_params = {"path": dataset_name, "language":subset, "use_auth_token":True}
dataset_shapes = None
TaskRegistry.add(
"oscar_marathi_corpus",
source=seqio.FunctionDataSource(
dataset_fn=functools.partial(dataset_fn, dataset_params=dataset_params),
splits=("train", "validation"),
caching_permitted=False,
num_input_examples=dataset_shapes,
),
preprocessors=[
functools.partial(
target_to_key, key_map={
"targets": None,
}, target_key="targets")],
output_features={"targets": seqio.Feature(vocabulary=seqio.PassThroughVocabulary, add_eos=False, dtype=tf.string, rank=0)},
metric_fns=[]
)
dataset = seqio.get_mixture_or_task("oscar_marathi_corpus").get_dataset(
sequence_length=None,
split="train",
shuffle=True,
num_epochs=1,
shard_info=seqio.ShardInfo(index=0, num_shards=10),
use_cached=False,
seed=42
)
for _, ex in zip(range(5), dataset):
print(ex['targets'].numpy().decode())
Link
No response
Description
Hey there, I have used seqio to get a well distributed mixture of samples from multiple dataset. However the resultant output from seqio is a python generator dict, which I cannot produce back into huggingface dataset.
The generator contains all the samples needed for training the model but I cannot convert it into a huggingface dataset.
The code looks like this:
I need to convert the seqio_data (generator) into huggingface dataset.
the complete seqio code goes here:
Owner
No response