Dataset: Ten Thousand Dreams#
The dataset we will create contains descriptions of dreams and their interpretations from Gustavus Hindman Miller’s “Ten Thousand Dreams, Interpreted.”
It’s inspired by Allison Parrish’s work “I Waded In Clear Water” (source code: aparrish/nanogenmo2014 on GitHub).
For her work, she prepared the data as follows:
Each word has multiple interpretations, and most of these interpretations can be broken down into what I call an action and a denotation:
To see an oak full of acorns, denotes increase and promotion.
In this entry, See an oak full of acorns is the action, and increase and promotion is the denotation. The text of this novel was made by extracting the actions and changing them to first-person, past-tense sentences:
I saw an oak full of acorns.
In our final dataset, the sentences from above will look like this:
{'conversations': [{'role': 'user', 'content': 'I saw an oak full of acorns.'},
{'role': 'assistant',
'content': 'To see an oak full of acorns, denotes increase and promotion. \n'}]}
Get the data#
Probably the easiest way to get the data is to work with Parrish’s code. It was written with Python 2 and uses a library that works only with Python 2, so we have to create a specific environment for that.
conda create -n datadreams python=2.7
Manually install pattern: https://digiasset.org/html/pattern.html
Below, you can see the modified code, which reads the text by Gustavus Hindman Miller and writes a text file with lines like:
I saw a forest of oaks.|To dream of seeing a forest of oaks, signifies great prosperity in all conditions of life.
I saw an oak full of acorns.|To see an oak full of acorns, denotes increase and promotion.
The `|` separates the two parts and will be used later to assign the texts to the roles of `user` and `assistant`.
import sys
import re
import random
import time
from pattern.en import parsetree, Word, conjugate, PAST, sentiment
from pattern.search import search
# from elaborate import elaborate_on, ElaborationImpossible
# from badwords import is_blacklisted
def phrase_replace(phrase):
    """Rewrite a parsed phrase as a first-person, past-tense sentence body.

    The first word of ``phrase`` is treated as the main verb: its lemma is
    conjugated to the past tense and prefixed with "I". Every following word
    is copied over, except that second- and third-person pronouns are mapped
    to their first-person counterparts and present-tense verbs (tags VBP and
    VBZ) are conjugated to the past tense.

    phrase -- sequence of parsed words; each word is read through its
              ``string``, ``lemma``, ``type`` and ``chunk.role`` attributes
              (presumably pattern.en Word objects -- confirm against caller).

    Returns the rewritten words joined by single spaces, without a final
    period.
    """
    # Pronouns whose replacement does not depend on tag or chunk role.
    fixed_swaps = {
        'his': 'my',
        'your': 'my',
        'yourself': 'myself',
        'herself': 'myself',
        'himself': 'myself',
        'hers': 'mine',
        'yours': 'mine',
    }

    def to_first_person(word):
        text = word.string
        if text == 'you':
            # Subject "you" becomes "I", any other "you" becomes "me".
            return 'I' if word.chunk.role == 'SBJ' else 'me'
        if text == 'her':
            # The part-of-speech tag decides first; the chunk role is only
            # consulted when the tag is neither PRP$ nor PRP.
            if word.type == 'PRP$':
                return 'my'
            if word.type == 'PRP' or word.chunk.role == 'OBJ':
                return 'me'
            return 'my'
        if text == 'him' and word.chunk.role == 'OBJ':
            return 'me'
        if text in ('he', 'she') and word.chunk.role == 'SBJ':
            return 'I'
        if text in fixed_swaps:
            return fixed_swaps[text]
        if word.type in ('VBP', 'VBZ'):
            return conjugate(text, tense=PAST)
        return text

    pieces = ["I", conjugate(phrase[0].lemma, tense=PAST)]
    pieces.extend(to_first_person(word) for word in phrase[1:])

    # Patch up two known artifacts of the word-by-word rewrite.
    sentence = ' '.join(pieces)
    sentence = sentence.replace("caed n't", "couldn't")
    sentence = sentence.replace("thought me was", "thought I was")
    return sentence
def extract_verb_phrases(tree):
    """Return the tails of ``tree`` that begin at a matched verb phrase.

    Searches ``tree`` with the pattern ``'to|you {VP}'``. For every match
    whose captured group does not start with the word "dream", the slice of
    ``tree`` from the start of the captured group to the end is collected.

    tree -- parsed sentence, as accepted by ``search`` and sliceable by index.

    Returns a list of slices (empty when nothing matches).
    """
    tails = []
    for match in search('to|you {VP}', tree):
        captured = match.group(1)
        if captured[0].string == "dream":
            # "to dream ..." only introduces the entry; it is not the action.
            continue
        tails.append(tree[captured.start:])
    return tails
def extract_verbs(tree):
    """Return the tails of ``tree`` that begin at a matched verb.

    Searches ``tree`` with the pattern ``'to|you {VB*}'``. Matches whose
    captured word is tagged VBG or VBZ, or is the word "dream", are skipped;
    for the rest, the slice of ``tree`` from the captured word to the end is
    collected.

    tree -- parsed sentence, as accepted by ``search`` and sliceable by index.

    Returns a list of slices (empty when nothing matches).
    """
    tails = []
    for match in search('to|you {VB*}', tree):
        head = match.group(1)[0]
        if head.type in ('VBG', 'VBZ') or head.string == "dream":
            continue
        tails.append(tree[match.group(1).start:])
    return tails
def extract_to_dream_that(tree):
    """Return the tails of ``tree`` following "to dream that she/he".

    Searches ``tree`` with the pattern ``'to dream that she|he {VB*}'`` and,
    for every match, collects the slice of ``tree`` from the captured verb to
    the end.

    tree -- parsed sentence, as accepted by ``search`` and sliceable by index.

    Returns a list of slices (empty when nothing matches).
    """
    return [
        tree[match.group(1).start:]
        for match in search('to dream that she|he {VB*}', tree)
    ]
def extract_gerunds(tree):
    """Return the tails of ``tree`` that begin at a gerund after "to dream of".

    Searches ``tree`` with the pattern ``'to dream of {VBG}'`` and, for every
    match, collects the slice of ``tree`` from the captured word to the end.

    tree -- parsed sentence, as accepted by ``search`` and sliceable by index.

    Returns a list of slices (empty when nothing matches).
    """
    return [
        tree[match.group(1).start:]
        for match in search('to dream of {VBG}', tree)
    ]
# Extraction script: reads data/dreams.txt paragraph by paragraph, derives
# first-person sentences from each paragraph's leading clause and writes one
# "sentence|paragraph" line per derived sentence to data/dreams_pairs.txt.
phrase_scores = list()  # not filled here; blacklist filtering and sentiment scoring are disabled
para = ""               # text of the paragraph currently being accumulated
output = []             # finished "sentence|paragraph" lines

with open('data/dreams.txt', 'r') as f:
    txt = f.readlines()

# A blank line marks the end of a paragraph. The extra empty sentinel line
# guarantees that the final paragraph is processed even when the file does
# not end with a blank line (it would otherwise be silently dropped).
for line in txt + [""]:
    line = line.strip()
    if line != "":
        para += line + " "
        continue

    # The clause before the first comma/semicolon is the "action"; the rest
    # of the paragraph is its denotation.
    parts = re.split(r'\s*[,;]\s*', para)
    if len(parts) > 1 and not parts[0].startswith('[') and '.' not in parts[0]:
        action = parts[0]
        denotes = ' '.join(parts[1:])  # not used further in this script
        # Strip phrases that only say the action happened in a dream, and
        # any {...} annotations.
        action = re.sub(r"in (your |a )?dreams?", "", action)
        action = re.sub(r"while dreaming", "", action)
        action = re.sub(r"\{[^}]*\}", "", action)
        tree = parsetree(action, lemmata=True, relations=True)[0]
        verb_phrases = extract_verb_phrases(tree)
        verbs = extract_verbs(tree)
        dream_thats = extract_to_dream_that(tree)
        gerunds = extract_gerunds(tree)
        # A set removes sentences produced by more than one extractor.
        phrases = set()
        for phrase in verb_phrases + verbs + dream_thats + gerunds:
            phrases.add(phrase_replace(phrase) + ".")
        for phrase in phrases:
            # Parenthesised single argument: same output under Python 2,
            # and the line also parses under Python 3.
            print(phrase + '|' + para)
            output.append(phrase + '|' + para)
    para = ""

with open('data/dreams_pairs.txt', 'w') as f:
    f.write('\n'.join(output))
Create the dataset#
# Read all "sentence|interpretation" lines of the pairs file into a list.
with open('data/dreams_pairs.txt') as f:
    txt = f.readlines()
# Preview the first four lines (each still ends with its newline character).
for line in txt[:4]:
    print(line)
I was abandoned.|To dream that you are abandoned, denotes that you will have difficulty in framing your plans for future success.
I abandoned others.|To abandon others, you will see unhappy conditions piled thick around you, leaving little hope of surmounting them.
I abandoned.|If it is your house that you abandon, you will soon come to grief in experimenting with fortune.
I abandoned my sweetheart.|If you abandon your sweetheart, you will fail to recover lost valuables, and friends will turn aside from your favors.
To use this data for training a chatbot, we have to transform it into a specific form:
{'conversations': [{'role': 'user', 'content': 'I saw an oak full of acorns.'},
{'role': 'assistant',
'content': 'To see an oak full of acorns, denotes increase and promotion. \n'}]}
Then we will create a list of all of these dictionaries. (So we’ll have a list of dictionaries, each of which stores a list of dictionaries.)
def str_to_dict(user, assistant):
    """Wrap one user/assistant text pair in a conversation record.

    Args:
        user: Text stored as the content of the 'user' message.
        assistant: Text stored as the content of the 'assistant' message.

    Returns:
        A dictionary with the single key 'conversations', whose value is a
        two-element list: the user message followed by the assistant message,
        each a dictionary with the keys 'role' and 'content'.
    """
    return {
        'conversations': [
            {'role': 'user', 'content': user},
            {'role': 'assistant', 'content': assistant},
        ],
    }
# Walk through the conversion for the first line of the pairs file.
line = txt[0]
print(line)
# strip() removes the trailing newline before the line is split at '|'.
user, assistant = line.strip().split('|') # Split a line into the text of user and assistant
print(user)
print(assistant)
I was abandoned.|To dream that you are abandoned, denotes that you will have difficulty in framing your plans for future success.
I was abandoned.
To dream that you are abandoned, denotes that you will have difficulty in framing your plans for future success.
# Build the list of conversation records, one per line of the pairs file.
dataset = []
for line in txt:
    line = line.strip()
    if not line:
        # Skip blank lines instead of failing on the unpacking below.
        continue
    # Split at the first '|' only, so an interpretation that itself contains
    # a '|' does not break the two-way unpacking. A line without any '|'
    # still raises ValueError, which flags a malformed pairs file.
    user, assistant = line.split('|', 1)
    data = str_to_dict(user, assistant)
    dataset.append(data)
# Bare expression: shown as the cell output when run in a notebook.
dataset[:5]
[{'conversations': [{'role': 'user', 'content': 'I was abandoned.'},
{'role': 'assistant',
'content': 'To dream that you are abandoned, denotes that you will have difficulty in framing your plans for future success.'}]},
{'conversations': [{'role': 'user', 'content': 'I abandoned others.'},
{'role': 'assistant',
'content': 'To abandon others, you will see unhappy conditions piled thick around you, leaving little hope of surmounting them.'}]},
{'conversations': [{'role': 'user', 'content': 'I abandoned.'},
{'role': 'assistant',
'content': 'If it is your house that you abandon, you will soon come to grief in experimenting with fortune.'}]},
{'conversations': [{'role': 'user', 'content': 'I abandoned my sweetheart.'},
{'role': 'assistant',
'content': 'If you abandon your sweetheart, you will fail to recover lost valuables, and friends will turn aside from your favors.'}]},
{'conversations': [{'role': 'user', 'content': 'I abandoned a mistress.'},
{'role': 'assistant',
'content': 'If you abandon a mistress, you will unexpectedly come into a goodly inheritance.'}]}]
# Number of conversation records in the list.
len(dataset)
3324
Save the dataset#
import json
# Serialize the whole list of conversation records into a single JSON file.
with open('data/dreams.json', 'w', encoding='utf-8') as f:
    json.dump(dataset, f)
Load the dataset#
!pip install datasets
from datasets import load_dataset # https://huggingface.co/docs/datasets/loading
# Load the saved JSON file with the 'json' loader; note that this rebinds
# the name `dataset` from the plain Python list to the loaded object.
dataset = load_dataset('json', data_files='data/dreams.json')
# Bare expression: shown as the cell output when run in a notebook.
dataset
DatasetDict({
train: Dataset({
features: ['conversations'],
num_rows: 3324
})
})
# Slice the first five rows of the 'train' split.
dataset['train'][:5]
{'conversations': [[{'content': 'I was abandoned.', 'role': 'user'},
{'content': 'To dream that you are abandoned, denotes that you will have difficulty in framing your plans for future success.',
'role': 'assistant'}],
[{'content': 'I abandoned others.', 'role': 'user'},
{'content': 'To abandon others, you will see unhappy conditions piled thick around you, leaving little hope of surmounting them.',
'role': 'assistant'}],
[{'content': 'I abandoned.', 'role': 'user'},
{'content': 'If it is your house that you abandon, you will soon come to grief in experimenting with fortune.',
'role': 'assistant'}],
[{'content': 'I abandoned my sweetheart.', 'role': 'user'},
{'content': 'If you abandon your sweetheart, you will fail to recover lost valuables, and friends will turn aside from your favors.',
'role': 'assistant'}],
[{'content': 'I abandoned a mistress.', 'role': 'user'},
{'content': 'If you abandon a mistress, you will unexpectedly come into a goodly inheritance.',
'role': 'assistant'}]]}
# Look at the 'conversations' field of the first row of the 'train' split.
dataset['train'][0]['conversations']
[{'content': 'I was abandoned.', 'role': 'user'},
{'content': 'To dream that you are abandoned, denotes that you will have difficulty in framing your plans for future success.',
'role': 'assistant'}]