Skip to content

how to convert a dict generator into a huggingface dataset.  #4417

@StephennFernandes

Description

@StephennFernandes

Link

No response

Description

Hey there, I have used seqio to get a well distributed mixture of samples from multiple dataset. However the resultant output from seqio is a python generator dict, which I cannot produce back into huggingface dataset.

The generator contains all the samples needed for training the model but I cannot convert it into a huggingface dataset.

The code looks like this:

for ex in seqio_data:
print(ex[“text”])

I need to convert the seqio_data (generator) into huggingface dataset.

the complete seqio code goes here:

import functools

import seqio
import tensorflow as tf
import t5.data
from datasets import load_dataset
from t5.data import postprocessors
from t5.data import preprocessors
from t5.evaluation import metrics
from seqio import FunctionDataSource, utils

TaskRegistry = seqio.TaskRegistry



def gen_dataset(split, shuffle=False, seed=None, column="text", dataset_params=None):
    dataset = load_dataset(**dataset_params)
    if shuffle:
        if seed:
            dataset = dataset.shuffle(seed=seed)
        else:
            dataset = dataset.shuffle()
    while True:
        for item in dataset[str(split)]:
            yield item[column]


def dataset_fn(split, shuffle_files, seed=None, dataset_params=None):
    return tf.data.Dataset.from_generator(
        functools.partial(gen_dataset, split, shuffle_files, seed, dataset_params=dataset_params),
        output_signature=tf.TensorSpec(shape=(), dtype=tf.string, name=dataset_name)
    )


@utils.map_over_dataset
def target_to_key(x, key_map, target_key):
    """Assign the value from the dataset to target_key in key_map"""
    return {**key_map, target_key: x}



dataset_name = 'oscar-corpus/OSCAR-2109'
subset= 'mr'
dataset_params = {"path": dataset_name, "language":subset, "use_auth_token":True}
dataset_shapes = None

TaskRegistry.add(
    "oscar_marathi_corpus",
    source=seqio.FunctionDataSource(
        dataset_fn=functools.partial(dataset_fn, dataset_params=dataset_params),
        splits=("train", "validation"),
        caching_permitted=False,
        num_input_examples=dataset_shapes,
    ),
preprocessors=[
functools.partial(
target_to_key, key_map={
"targets": None,
}, target_key="targets")],
    output_features={"targets": seqio.Feature(vocabulary=seqio.PassThroughVocabulary, add_eos=False, dtype=tf.string, rank=0)},
    metric_fns=[]
)

dataset = seqio.get_mixture_or_task("oscar_marathi_corpus").get_dataset(
    sequence_length=None,
    split="train",
    shuffle=True,
    num_epochs=1,
    shard_info=seqio.ShardInfo(index=0, num_shards=10),
    use_cached=False,
    seed=42
)
for _, ex in zip(range(5), dataset):
     print(ex['targets'].numpy().decode())

Owner

No response

Metadata

Metadata

Assignees

Labels

questionFurther information is requested

Type

No type
No fields configured for issues without a type.

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions