Dataset: Ten Thousand Dreams

Dataset: Ten Thousand Dreams

The dataset we will create contains descriptions of dreams and their interpretations from Gustavus Hindman Miller’s “Ten Thousand Dreams, Interpreted.”

It’s inspired by Allison Parrish’s work “I Waded In Clear Water” (repository: aparrish/nanogenmo2014).

For her work, she prepared the data as follows:

Each word has multiple interpretations, and most of these interpretations can be broken down into what I call an action and a denotation:

To see an oak full of acorns, denotes increase and promotion.

In this entry, See an oak full of acorns is the action, and increase and promotion is the denotation. The text of this novel was made by extracting the actions and changing them to first-person, past-tense sentences:

I saw an oak full of acorns.

In our final dataset, the sentences from above will look like this:

{'conversations': [{'role': 'user', 'content': 'I saw an oak full of acorns.'},
  {'role': 'assistant',
   'content': 'To see an oak full of acorns, denotes increase and promotion. \n'}]}

Get the data

Probably the easiest way to get the data is to work with Parrish’s code. It was written with Python 2 and uses a library that works only with Python 2, so we have to create a specific environment for that.

conda create -n datadreams python=2.7

Manually install pattern: https://digiasset.org/html/pattern.html

Below, you can see the modified code, which reads the text by Gustavus Hindman Miller and writes a text file with lines like:

I saw a forest of oaks.|To dream of seeing a forest of oaks, signifies great prosperity in all conditions of life.

I saw an oak full of acorns.|To see an oak full of acorns, denotes increase and promotion.

The | separates the two parts and will be used later to assign the texts to the roles of user and assistant.

import sys
import re
import random
import time

from pattern.en import parsetree, Word, conjugate, PAST, sentiment
from pattern.search import search

# from elaborate import elaborate_on, ElaborationImpossible
# from badwords import is_blacklisted

def phrase_replace(phrase):
    """Rewrite a parsed phrase as a first-person, past-tense sentence.

    The first word of ``phrase`` is replaced by the past tense of its lemma
    and prefixed with "I". In the remaining words, second- and third-person
    pronouns are swapped for first-person ones and words tagged VBP/VBZ are
    put into the past tense; every other word is copied unchanged.

    phrase -- sequence of parsed words; the code reads ``.lemma``,
        ``.string``, ``.type`` and ``.chunk.role`` from them (presumably
        pattern.en ``Word`` objects taken from a parse tree -- confirm
        against the callers).

    Returns the words joined by single spaces, without a final period.
    """
    # Replacements that depend on nothing but the word itself.
    fixed_swaps = {
        'his': 'my',
        'your': 'my',
        'yourself': 'myself',
        'herself': 'myself',
        'himself': 'myself',
        'hers': 'mine',
        'yours': 'mine',
    }

    def to_first_person(word):
        text = word.string
        if text == 'you':
            # Subject "you" becomes "I"; any other "you" becomes "me".
            return 'I' if word.chunk.role == 'SBJ' else 'me'
        if text == 'her':
            # The tag decides first (PRP$ vs. PRP); the chunk role is only
            # consulted for any other tag.
            if word.type == 'PRP$':
                return 'my'
            if word.type == 'PRP':
                return 'me'
            return 'me' if word.chunk.role == 'OBJ' else 'my'
        if text == 'him' and word.chunk.role == 'OBJ':
            return 'me'
        if text in ('he', 'she') and word.chunk.role == 'SBJ':
            return 'I'
        if text in fixed_swaps:
            return fixed_swaps[text]
        if word.type in ('VBP', 'VBZ'):
            return conjugate(text, tense=PAST)
        return text

    words = ["I", conjugate(phrase[0].lemma, tense=PAST)]
    words.extend(to_first_person(word) for word in phrase[1:])
    sentence = ' '.join(words)
    # Hard-coded corrections for two awkward outputs of the rewrite above.
    sentence = sentence.replace("caed n't", "couldn't")
    sentence = sentence.replace("thought me was", "thought I was")
    return sentence

def extract_verb_phrases(tree):
    """Collect the phrases of ``tree`` that start at a matched verb phrase.

    Searches ``tree`` with the pattern ``'to|you {VP}'`` and, for every
    match whose captured group does not begin with the word "dream",
    keeps the slice of ``tree`` running from the start of that group to
    the end of the tree.

    tree -- the parsed sentence to search (the script passes the first
        sentence returned by ``parsetree``).

    Returns a list of tree slices in match order; empty if nothing matched.
    """
    phrases = []
    for match in search('to|you {VP}', tree):
        captured = match.group(1)
        # "to dream ..." is only the book's framing, not the dreamt action.
        if captured[0].string == "dream":
            continue
        phrases.append(tree[captured.start:])
    return phrases

def extract_verbs(tree):
    """Collect the phrases of ``tree`` that start at a matched verb.

    Searches ``tree`` with the pattern ``'to|you {VB*}'``. Matches whose
    captured group starts with a word tagged VBG or VBZ, or with the word
    "dream", are dropped; for the rest the slice of ``tree`` from the start
    of the captured group to the end of the tree is kept.

    Returns a list of tree slices in match order.
    """
    captured_groups = (match.group(1) for match in search('to|you {VB*}', tree))
    return [
        tree[group.start:]
        for group in captured_groups
        if group[0].type not in ('VBG', 'VBZ') and group[0].string != "dream"
    ]

def extract_to_dream_that(tree):
    """Collect the phrases of ``tree`` that follow "to dream that she/he".

    Searches ``tree`` with the pattern ``'to dream that she|he {VB*}'`` and,
    for every match, keeps the slice of ``tree`` from the start of the
    captured group to the end of the tree.

    Returns a list of tree slices in match order.
    """
    return [
        tree[match.group(1).start:]
        for match in search('to dream that she|he {VB*}', tree)
    ]

def extract_gerunds(tree):
    """Collect the phrases of ``tree`` that follow "to dream of" + VBG word.

    Searches ``tree`` with the pattern ``'to dream of {VBG}'`` and, for
    every match, keeps the slice of ``tree`` from the start of the captured
    group to the end of the tree.

    Returns a list of tree slices in match order.
    """
    found = search('to dream of {VBG}', tree)
    return [tree[hit.group(1).start:] for hit in found]

# Kept from the original project: it additionally filtered phrases through a
# blacklist and scored them by sentiment. That step is disabled here (the
# corresponding imports are commented out), so this list stays empty.
phrase_scores = list()
output = []


def _paragraph_pairs(para):
    """Return the 'first-person sentence|paragraph' lines for one paragraph.

    para -- one paragraph of the source text, joined into a single line.

    Returns a list of strings, one per distinct sentence derived from the
    paragraph's first clause; empty if the paragraph is skipped.
    """
    parts = re.split(r'\s*[,;]\s*', para)
    # Skip paragraphs without a comma/semicolon, paragraphs starting with
    # '[', and paragraphs whose first clause already contains a period.
    if len(parts) <= 1 or parts[0].startswith('[') or '.' in parts[0]:
        return []
    # Only the first clause (the "action") is parsed.
    action = parts[0]
    action = re.sub(r"in (your |a )?dreams?", "", action)
    action = re.sub(r"while dreaming", "", action)
    action = re.sub(r"\{[^}]*\}", "", action)
    tree = parsetree(action, lemmata=True, relations=True)[0]
    candidates = (
        extract_verb_phrases(tree)
        + extract_verbs(tree)
        + extract_to_dream_that(tree)
        + extract_gerunds(tree)
    )
    # The extractors can yield the same sentence more than once; the set
    # keeps each sentence a single time.
    phrases = set(phrase_replace(candidate) + "." for candidate in candidates)
    return [phrase + '|' + para for phrase in phrases]


with open('data/dreams.txt', 'r') as f:
    txt = f.readlines()

para = ""
# A blank line ends a paragraph. The extra "" acts as a final blank line so
# the last paragraph is processed even when the file does not end with one.
for line in txt + [""]:
    line = line.strip()
    if line != "":
        para += line + " "
        continue
    for pair in _paragraph_pairs(para):
        print(pair)
        output.append(pair)
    para = ""

with open('data/dreams_pairs.txt', 'w') as f:
    f.write('\n'.join(output))

Create the dataset

# Load every "sentence|interpretation" line written by the extraction script.
with open('data/dreams_pairs.txt') as pairs_file:
    txt = pairs_file.readlines()

# Show the first four pairs; each line still ends with its newline, so the
# printed entries are separated by an empty line.
for preview in txt[:4]:
    print(preview)

I was abandoned.|To dream that you are abandoned, denotes that you will have difficulty in framing your plans for future success. 

I abandoned others.|To abandon others, you will see unhappy conditions piled thick around you, leaving little hope of surmounting them. 

I abandoned.|If it is your house that you abandon, you will soon come to grief in experimenting with fortune. 

I abandoned my sweetheart.|If you abandon your sweetheart, you will fail to recover lost valuables, and friends will turn aside from your favors.

To use this data for training a chatbot, we have to transform it into a specific form:

{'conversations': [{'role': 'user', 'content': 'I saw an oak full of acorns.'},
  {'role': 'assistant',
   'content': 'To see an oak full of acorns, denotes increase and promotion. \n'}]}

Then we will create a list of all of these dictionaries. (So we’ll have a list of dictionaries, each of which stores a list of dictionaries.)

def str_to_dict(user, assistant):
    """Wrap one user/assistant text pair as a single-turn conversation record.

    user -- content of the message with role 'user'.
    assistant -- content of the message with role 'assistant'.

    Returns a dictionary whose only key, 'conversations', holds a list of
    two message dictionaries (user first, then assistant), each with the
    keys 'role' and 'content'.
    """
    return {
        'conversations': [
            {'role': 'user', 'content': user},
            {'role': 'assistant', 'content': assistant},
        ],
    }

line = txt[0]
print(line)
# Split a line into the text of user and assistant. Splitting only at the
# first '|' keeps the unpacking to exactly two parts even if the
# interpretation text itself contains a '|'.
user, assistant = line.strip().split('|', 1)
print(user)
print(assistant)

I was abandoned.|To dream that you are abandoned, denotes that you will have difficulty in framing your plans for future success. 

I was abandoned.
To dream that you are abandoned, denotes that you will have difficulty in framing your plans for future success.

# Turn every "sentence|interpretation" line into one conversation record.
dataset = []
for line in txt:
    # Split only at the first '|': the part before it is the user's text,
    # everything after it (including any further '|') is the assistant's.
    # A line without any '|' still raises ValueError, as before.
    user, assistant = line.strip().split('|', 1)
    dataset.append(str_to_dict(user, assistant))

dataset[:5]

[{'conversations': [{'role': 'user', 'content': 'I was abandoned.'},
   {'role': 'assistant',
    'content': 'To dream that you are abandoned, denotes that you will have difficulty in framing your plans for future success.'}]},
 {'conversations': [{'role': 'user', 'content': 'I abandoned others.'},
   {'role': 'assistant',
    'content': 'To abandon others, you will see unhappy conditions piled thick around you, leaving little hope of surmounting them.'}]},
 {'conversations': [{'role': 'user', 'content': 'I abandoned.'},
   {'role': 'assistant',
    'content': 'If it is your house that you abandon, you will soon come to grief in experimenting with fortune.'}]},
 {'conversations': [{'role': 'user', 'content': 'I abandoned my sweetheart.'},
   {'role': 'assistant',
    'content': 'If you abandon your sweetheart, you will fail to recover lost valuables, and friends will turn aside from your favors.'}]},
 {'conversations': [{'role': 'user', 'content': 'I abandoned a mistress.'},
   {'role': 'assistant',
    'content': 'If you abandon a mistress, you will unexpectedly come into a goodly inheritance.'}]}]

# Number of conversation records in the list (one per line read from the pairs file).
len(dataset)

Save the dataset

import json

# Store the whole list of conversation records as one JSON array.
with open('data/dreams.json', 'w', encoding='utf-8') as json_file:
    json_file.write(json.dumps(dataset))

Load the dataset

!pip install datasets 

from datasets import load_dataset  # https://huggingface.co/docs/datasets/loading
# Read the saved JSON file back with the 'json' loader; as the output below
# shows, the records end up in a DatasetDict under the 'train' split.
dataset = load_dataset('json', data_files='data/dreams.json')

# Display the loaded object (splits, features and number of rows).
dataset

DatasetDict({
    train: Dataset({
        features: ['conversations'],
        num_rows: 3324
    })
})

# Slicing the split returns a dict of columns: 'conversations' maps to a list holding the first five conversations.
dataset['train'][:5]

{'conversations': [[{'content': 'I was abandoned.', 'role': 'user'},
   {'content': 'To dream that you are abandoned, denotes that you will have difficulty in framing your plans for future success.',
    'role': 'assistant'}],
  [{'content': 'I abandoned others.', 'role': 'user'},
   {'content': 'To abandon others, you will see unhappy conditions piled thick around you, leaving little hope of surmounting them.',
    'role': 'assistant'}],
  [{'content': 'I abandoned.', 'role': 'user'},
   {'content': 'If it is your house that you abandon, you will soon come to grief in experimenting with fortune.',
    'role': 'assistant'}],
  [{'content': 'I abandoned my sweetheart.', 'role': 'user'},
   {'content': 'If you abandon your sweetheart, you will fail to recover lost valuables, and friends will turn aside from your favors.',
    'role': 'assistant'}],
  [{'content': 'I abandoned a mistress.', 'role': 'user'},
   {'content': 'If you abandon a mistress, you will unexpectedly come into a goodly inheritance.',
    'role': 'assistant'}]]}

# Indexing one row and then its 'conversations' column yields that row's list of message dicts.
dataset['train'][0]['conversations']

[{'content': 'I was abandoned.', 'role': 'user'},
 {'content': 'To dream that you are abandoned, denotes that you will have difficulty in framing your plans for future success.',
  'role': 'assistant'}]