Photo by @weirick

Building a Text Dataset

Loading text data in TensorFlow — and a note on building datasets with text

import os

import tensorflow as tf
import tensorflow_datasets as tfds

# Source of the three English translations of Homer's Iliad used by the
# TensorFlow "load text" tutorial this snippet is based on.
DIRECTORY_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
FILE_NAMES = ['cowper.txt', 'derby.txt', 'butler.txt']

# Download each translation into the local Keras cache (~/.keras by default).
for name in FILE_NAMES:
    text_dir = tf.keras.utils.get_file(name, origin=DIRECTORY_URL + name)

# All files land in the same cache directory; keep it for later joins.
parent_dir = os.path.dirname(text_dir)


def labeler(example, index):
    """Pair a line of text with an int64 label identifying its source file."""
    return example, tf.cast(index, tf.int64)


labeled_data_sets = []

# One dataset per file: each line becomes an (example, label) pair, where the
# label is the file's index in FILE_NAMES.
for i, file_name in enumerate(FILE_NAMES):
    lines_dataset = tf.data.TextLineDataset(os.path.join(parent_dir, file_name))
    # Dataset.map traces the lambda immediately, so capturing the loop
    # variable `i` here is safe (no late-binding issue).
    labeled_dataset = lines_dataset.map(lambda ex: labeler(ex, i))
    labeled_data_sets.append(labeled_dataset)

# Concatenate the per-file datasets into one labeled dataset.
all_labeled_data = labeled_data_sets[0]
for labeled_dataset in labeled_data_sets[1:]:
    all_labeled_data = all_labeled_data.concatenate(labeled_dataset)

tokenizer = tfds.features.text.Tokenizer()

# Build the vocabulary: the set of every token seen across all files.
vocabulary_set = set()
for text_tensor, _ in all_labeled_data:
    some_tokens = tokenizer.tokenize(text_tensor.numpy())
    vocabulary_set.update(some_tokens)

vocab_size = len(vocabulary_set)

Encode examples

encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

Building datasets with text

Interested in AI Policy and Ethics. Student at the University of Copenhagen, MSc in Social Data Science. All views are my own.

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App Store
A button that says 'Get it on Google Play', and if clicked it will lead you to the Google Play store