From 1b953d51428c66e65364eb7652f7516352118190 Mon Sep 17 00:00:00 2001
From: Matt Hanlon
Date: Fri, 20 Aug 2021 22:44:05 -0700
Subject: [PATCH 1/6] style fixes

---
 markov.py | 45 +++++++++++++--------------------------------
 1 file changed, 13 insertions(+), 32 deletions(-)

diff --git a/markov.py b/markov.py
index 23e41bc..7317699 100755
--- a/markov.py
+++ b/markov.py
@@ -1,6 +1,5 @@
 import yaml
 import random
-from secrets import SystemRandom
 from multiprocess import Lock, Manager, Process
 
 BACKUP_FILE = "codebro.yaml"
@@ -19,13 +18,11 @@ def load_corpus(self, source_file: str):
         return yaml.load(infile.read(), Loader=yaml.Loader)
 
     def generate_markov_text(self, words: list, cache: dict, seed_phrase=None):
-        w1, w2 = "", ""
         if seed_phrase:
-            w1, w2 = seed_phrase[0], seed_phrase[1]
+            w1, w2 = seed_phrase[:2]
         else:
-            urandom = SystemRandom()
             valid_starts = [(x[0], x[1]) for x in cache.keys() if x[0] == "<start>"]
-            w1, w2 = valid_starts[urandom.randint(0, len(valid_starts) - 1)]
+            w1, w2 = random.choice(valid_starts)
 
         gen_words = []
         while True:
@@ -59,11 +56,11 @@ def learn(self, sentence: str):
         # strip, uppercase, and check for inclusion in IGNORE_WORDS list
         is_ignored = lambda x: x.strip("\'\"!@#$%^&*().,/\\+=<>?:;").upper() in IGNORE_WORDS
         tokens = [x for x in tokens if not is_ignored(x)]
-        if len(tokens) == 0:
+        if not tokens:
             return # nothing to learn here!
 
-        tokens[len(tokens) - 1] = tokens[len(tokens) - 1].strip(".?!")
-        tokens = [u"<start>"] + tokens + [u"<stop>"]
+        tokens[-1] = tokens[-1].strip(".?!")
+        tokens = [u"<start>", *tokens, u"<stop>"]
         indexes_with_stops = [tokens.index(x) for x in tokens if x.strip(".?!") != x]
         for i in indexes_with_stops[::-1]:
             tokens[i] = tokens[i].strip(".?!")
@@ -76,41 +73,25 @@ def learn(self, sentence: str):
         # there must be a better way to serialize from the proxy ..
         local_words = [word for word in self.words]
         with open('codebro.yaml', 'w') as outfile:
-            lk.acquire()
-            outfile.write(yaml.dump(local_words, default_flow_style=True))
-            lk.release()
+            with lk:
+                outfile.write(yaml.dump(local_words, default_flow_style=True))
 
     def create_response(self, prompt="", learn=False):
-        prompt_tokens = prompt.split()
-
         # set seedword from somewhere in words if there's no prompt
-        if len(prompt_tokens) < 1:
-            seed = random.randint(0, len(self.words)-1)
-            prompt_tokens.append(self.words[seed])
+        prompt_tokens = prompt.split() or [random.choice(self.words)]
 
         # create a set of lookups for phrases that start with words
         # contained in prompt phrase
-        seed_tuples = []
-        for i in range(0, len(prompt_tokens)-2):
-            seed_phrase = ("<start>", prompt_tokens[i])
-            seed_tuples.append(seed_phrase)
+        seed_tuples = [("<start>", tok) for tok in prompt_tokens[:-2]]
 
         # lookup seeds in cache; compile a list of 'hits'
-        seed_phrase = None
-        valid_seeds = []
-        for seed in seed_tuples:
-            if seed in self.cache:
-                valid_seeds.append(seed)
+        valid_seeds = [seed for seed in seed_tuples if seed in self.cache]
 
         # either seed the lookup with a randomly selected valid seed,
         # or if there were no 'hits' generate with no seedphrase
-        if len(valid_seeds) > 0:
-            seed_phrase = valid_seeds[random.randrange(0, len(valid_seeds), 1)]
-            response = self.generate_markov_text(self.words, self.cache, seed_phrase)
-        else:
-            response = self.generate_markov_text(self.words, self.cache)
+        seed_phrase = random.choice(valid_seeds) if valid_seeds else None
+        response = self.generate_markov_text(self.words, self.cache, seed_phrase)
 
         if learn:
-            p = Process(target=self.learn, args=(prompt,))
-            p.start()
+            Process(target=self.learn, args=(prompt,)).start()
         return response

From 1891aeb47a3bbcce6dba97ffb86d8fb1b7dd57fd Mon Sep 17 00:00:00 2001
From: Matt Hanlon
Date: Wed, 25 Aug 2021 02:13:09 -0700
Subject: [PATCH 2/6] some optimizations for markov generation

cache start words separately to avoid scans.
avoid storing useless grams
---
 markov.py | 107 ++++++++++++++++++++++++++----------------------
 1 file changed, 51 insertions(+), 56 deletions(-)

diff --git a/markov.py b/markov.py
index f1aa75e..26ffce4 100755
--- a/markov.py
+++ b/markov.py
@@ -2,6 +2,8 @@
 import random
 from multiprocess import Lock, Manager, Process
 
+START = "<start>"
+STOP = "<stop>"
 
 # instantiate a Markov object with the source file
 class Markov:
@@ -9,32 +11,30 @@ def __init__(self, brain_file: str, ignore_words, skip_mp=False):
         self.brain_file = brain_file
         self.ignore_words = ignore_words
         self.skip_mp = skip_mp
-        if not self.skip_mp:
+        if self.skip_mp:
+            self.words = list(self.load_corpus(brain_file))
+        else:
             self.manager = Manager()
             self.words = self.manager.list(self.load_corpus(brain_file))
-            self.cache = self.manager.dict(self.database(self.words, {}))
-        else:
-            self.words = list(self.load_corpus(brain_file))
-            self.cache = dict(self.database(self.words, {}))
+        self.update_cache()
 
     @classmethod
     def load_corpus(cls, source_file: str):
         with open(source_file, 'r') as infile:
             return yaml.load(infile.read(), Loader=yaml.Loader)
 
-    @classmethod
-    def generate_markov_text(cls, words: list, cache: dict, seed_phrase=None):
-        if seed_phrase:
-            w1, w2 = seed_phrase[:2]
+    def generate_markov_text(self, seed=None):
+        if seed:
+            w1 = seed
         else:
-            valid_starts = [(x[0], x[1]) for x in cache.keys() if x[0] == "<start>"]
-            w1, w2 = random.choice(valid_starts)
+            w1 = random.choice(self.cache[START])
+        w2 = random.choice(self.cache[w1])
 
-        gen_words = []
+        gen_words = [w1]
         while True:
-            if w2 == "<stop>":
+            if w2 == STOP:
                 break
-            w1, w2 = w2, random.choice(cache[(w1, w2)])
+            w1, w2 = w2, random.choice(self.cache[(w1, w2)])
             gen_words.append(w1)
 
         message = ' '.join(gen_words)
@@ -47,67 +47,62 @@ def triples(cls, words):
         for i in range(len(words) - 2):
             yield (words[i], words[i+1], words[i+2])
 
-    def database(self, words: list, cache: dict):
-        for w1, w2, w3 in self.triples(words):
-            key = (w1, w2)
-            if key in cache:
-                if not (w3 in cache[key]):
-                    cache[key].append(w3)
+    def update_cache(self):
+        db = {START: set()}
+        next_word_is_start = True
+        for w1, w2, w3 in self.triples(self.words):
+            if w1 in (START, STOP) or w2 in (START, STOP):
+                next_word_is_start = True
+            else:
+                if next_word_is_start:
+                    db[START].add(w1)
+                    db.setdefault(w1, set()).add(w2)
+                    next_word_is_start = False
+                db.setdefault((w1, w2), set()).add(w3)
+        self.cache = {key: list(val) for key, val in db.items()}
+
+    @classmethod
+    def tokenize(cls, words: list):
+        yield START
+        for w in words:
+            if any(c in w for c in ('.', '?', '!')):
+                yield STOP
+                yield w.strip(".?!")
+                yield START
             else:
-                cache[key] = [w3]
-        return cache
+                yield w
+        yield STOP
 
     def learn(self, sentence: str):
-        tokens = sentence.split()
+        words = sentence.split()
 
         # strip, uppercase, and check for inclusion in IGNORE_WORDS list
         is_ignored = lambda x: x.strip("\'\"!@#$%^&*().,/\\+=<>?:;").upper() in self.ignore_words
-        tokens = [x for x in tokens if not is_ignored(x)]
-        if not tokens:
+        words = [x for x in words if not is_ignored(x)]
+        if not words:
             return # nothing to learn here!
 
-        tokens[-1] = tokens[-1].strip(".?!")
-        tokens = [u"<start>", *tokens, u"<stop>"]
-        indexes_with_stops = [tokens.index(x) for x in tokens if x.strip(".?!") != x]
-        for i in indexes_with_stops[::-1]:
-            tokens[i] = tokens[i].strip(".?!")
-            tokens.insert(i + 1, u"<stop>")
-            tokens.insert(i + 2, u"<start>")
-
-        self.words += tokens
-        self.cache = self.database(self.words, {})
+        self.words += list(self.tokenize(words))
+        self.update_cache()
         lk = None
         if not self.skip_mp:
             lk = Lock()
-        # there must be a better way to serialize from the proxy ..
-        local_words = [word for word in self.words]
         with open(self.brain_file, 'w') as outfile:
             if not self.skip_mp:
                 lk.acquire()
-            outfile.write(yaml.dump(local_words, default_flow_style=True))
+            outfile.write(yaml.dump(list(self.words), default_flow_style=True))
             if not self.skip_mp:
                 lk.release()
 
     def create_response(self, prompt="", learn=False):
         # set seedword from somewhere in words if there's no prompt
-        prompt_tokens = prompt.split() or [random.choice(self.words)]
-
-        # create a set of lookups for phrases that start with words
-        # contained in prompt phrase
-        seed_tuples = [("<start>", tok) for tok in prompt_tokens[:-2]]
-
-        # lookup seeds in cache; compile a list of 'hits'
-        valid_seeds = [seed for seed in seed_tuples if seed in self.cache]
-
-        # either seed the lookup with a randomly selected valid seed,
-        # or if there were no 'hits' generate with no seedphrase
-        seed_phrase = random.choice(valid_seeds) if valid_seeds else None
-        response = self.generate_markov_text(self.words, self.cache, seed_phrase)
-
+        prompt_tokens = prompt.split()
+        valid_seeds = [tok for tok in prompt_tokens[:-2] if tok in self.cache[START]]
+        seed_word = random.choice(valid_seeds) if valid_seeds else None
+        response = self.generate_markov_text(seed_word)
         if learn:
-            if not self.skip_mp:
-                p = Process(target=self.learn, args=(prompt,))
-                p.start()
-            else:
+            if self.skip_mp:
                 self.learn(prompt)
+            else:
+                Process(target=self.learn, args=(prompt,)).start()
         return response

From 606baa634d8b6c01877e1e84a4ed2d430e584cd2 Mon Sep 17 00:00:00 2001
From: Matt Hanlon
Date: Wed, 25 Aug 2021 02:33:54 -0700
Subject: [PATCH 3/6] avoid linear operation when choosing seed

---
 markov.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/markov.py b/markov.py
index 26ffce4..8ecbcb5 100755
--- a/markov.py
+++ b/markov.py
@@ -51,7 +51,7 @@ def update_cache(self):
         db = {START: set()}
         next_word_is_start = True
         for w1, w2, w3 in self.triples(self.words):
-            if w1 in (START, STOP) or w2 in (START, STOP):
+            if w1 in (START, STOP) or w2 in (START, STOP) or w3 == START:
                 next_word_is_start = True
             else:
                 if next_word_is_start:
                     db[START].add(w1)
                     db.setdefault(w1, set()).add(w2)
                     next_word_is_start = False
                 db.setdefault((w1, w2), set()).add(w3)
         self.cache = {key: list(val) for key, val in db.items()}
@@ -97,7 +97,7 @@ def learn(self, sentence: str):
     def create_response(self, prompt="", learn=False):
         # set seedword from somewhere in words if there's no prompt
         prompt_tokens = prompt.split()
-        valid_seeds = [tok for tok in prompt_tokens[:-2] if tok in self.cache[START]]
+        valid_seeds = [tok for tok in prompt_tokens[:-2] if tok in self.cache and tok != START]
         seed_word = random.choice(valid_seeds) if valid_seeds else None
         response = self.generate_markov_text(seed_word)
         if learn:

From 227d123e910d9625ed0f6068e9602b3e1b0e65a8 Mon Sep 17 00:00:00 2001
From: Matt Hanlon
Date: Wed, 25 Aug 2021 02:57:15 -0700
Subject: [PATCH 4/6] reorder methods

---
 markov.py | 58 +++++++++++++++++++++++++++----------------------------
 1 file changed, 29 insertions(+), 29 deletions(-)

diff --git a/markov.py b/markov.py
index 8ecbcb5..f610f2f 100755
--- a/markov.py
+++ b/markov.py
@@ -23,23 +23,6 @@ def load_corpus(cls, source_file: str):
         with open(source_file, 'r') as infile:
             return yaml.load(infile.read(), Loader=yaml.Loader)
 
-    def generate_markov_text(self, seed=None):
-        if seed:
-            w1 = seed
-        else:
-            w1 = random.choice(self.cache[START])
-        w2 = random.choice(self.cache[w1])
-
-        gen_words = [w1]
-        while True:
-            if w2 == STOP:
-                break
-            w1, w2 = w2, random.choice(self.cache[(w1, w2)])
-            gen_words.append(w1)
-
-        message = ' '.join(gen_words)
-        return message
-
     @classmethod
     def triples(cls, words):
         if len(words) < 3:
@@ -47,6 +30,18 @@ def triples(cls, words):
         for i in range(len(words) - 2):
             yield (words[i], words[i+1], words[i+2])
 
+    @classmethod
+    def tokenize(cls, words: list):
+        yield START
+        for w in words:
+            if any(c in w for c in ('.', '?', '!')):
+                yield STOP
+                yield w.strip(".?!")
+                yield START
+            else:
+                yield w
+        yield STOP
+
     def update_cache(self):
         db = {START: set()}
         next_word_is_start = True
@@ -61,18 +56,6 @@ def update_cache(self):
                 db.setdefault((w1, w2), set()).add(w3)
         self.cache = {key: list(val) for key, val in db.items()}
 
-    @classmethod
-    def tokenize(cls, words: list):
-        yield START
-        for w in words:
-            if any(c in w for c in ('.', '?', '!')):
-                yield STOP
-                yield w.strip(".?!")
-                yield START
-            else:
-                yield w
-        yield STOP
-
     def learn(self, sentence: str):
         words = sentence.split()
 
@@ -94,6 +77,23 @@ def learn(self, sentence: str):
             if not self.skip_mp:
                 lk.release()
 
+    def generate_markov_text(self, seed=None):
+        if seed:
+            w1 = seed
+        else:
+            w1 = random.choice(self.cache[START])
+        w2 = random.choice(self.cache[w1])
+
+        gen_words = [w1]
+        while True:
+            if w2 == STOP:
+                break
+            w1, w2 = w2, random.choice(self.cache[(w1, w2)])
+            gen_words.append(w1)
+
+        message = ' '.join(gen_words)
+        return message
+
     def create_response(self, prompt="", learn=False):
         # set seedword from somewhere in words if there's no prompt
         prompt_tokens = prompt.split()

From b64d4d99086436616ecb8c89ae82ac55c92d7146 Mon Sep 17 00:00:00 2001
From: Matt Hanlon
Date: Wed, 25 Aug 2021 10:34:06 -0700
Subject: [PATCH 5/6] incremental updates for in-memory markov db

---
 markov.py | 64 +++++++++++++++++++++++++++++++++----------------------
 1 file changed, 38 insertions(+), 26 deletions(-)

diff --git a/markov.py b/markov.py
index f610f2f..475dff9 100755
--- a/markov.py
+++ b/markov.py
@@ -30,8 +30,14 @@ def triples(cls, words):
         for i in range(len(words) - 2):
             yield (words[i], words[i+1], words[i+2])
 
-    @classmethod
-    def tokenize(cls, words: list):
+    def _ignore(self, word: str):
+        word.strip("\'\"!@#$%^&*().,/\\+=<>?:;").upper() in self.ignore_words
+
+    def tokenize(self, sentence: str):
+        words = [w for w in sentence.split() if not self._ignore(w)]
+        if not words:
+            return
+
         yield START
         for w in words:
             if any(c in w for c in ('.', '?', '!')):
@@ -42,31 +48,36 @@ def tokenize(cls, words: list):
                 yield w
         yield STOP
 
-    def update_cache(self):
-        db = {START: set()}
-        next_word_is_start = True
-        for w1, w2, w3 in self.triples(self.words):
+    def update_cache(self, new_sentence = None):
+        if new_sentence:
+            db = self.cache
+            words = list(self.tokenize(new_sentence))
+        else:
+            db = {START: []}
+            words = self.words
+
+        start_of_chain = True
+        for w1, w2, w3 in self.triples(words):
             if w1 in (START, STOP) or w2 in (START, STOP) or w3 == START:
-                next_word_is_start = True
+                start_of_chain = True
             else:
-                if next_word_is_start:
-                    db[START].add(w1)
-                    db.setdefault(w1, set()).add(w2)
-                    next_word_is_start = False
-                db.setdefault((w1, w2), set()).add(w3)
-        self.cache = {key: list(val) for key, val in db.items()}
-
-    def learn(self, sentence: str):
-        words = sentence.split()
-
-        # strip, uppercase, and check for inclusion in IGNORE_WORDS list
-        is_ignored = lambda x: x.strip("\'\"!@#$%^&*().,/\\+=<>?:;").upper() in self.ignore_words
-        words = [x for x in words if not is_ignored(x)]
-        if not words:
-            return # nothing to learn here!
+                if start_of_chain:
+                    if w1 not in db[START]:
+                        db[START].append(w1)
+                    next_words = db.setdefault(w1, [])
+                    if w2 not in next_words:
+                        next_words.append(w2)
+                    start_of_chain = False
+                next_words = db.setdefault((w1, w2), [])
+                if w3 not in next_words:
+                    next_words.append(w3)
+        self.cache = db
 
-        self.words += list(self.tokenize(words))
-        self.update_cache()
+    def update_corpus(self, sentence: str):
+        new_words = list(self.tokenize(sentence))
+        if not new_words:
+            return
+        self.words += new_words
         lk = None
         if not self.skip_mp:
             lk = Lock()
@@ -102,7 +113,8 @@ def create_response(self, prompt="", learn=False):
         response = self.generate_markov_text(seed_word)
         if learn:
             if self.skip_mp:
-                self.learn(prompt)
+                self.update_corpus(prompt)
             else:
-                Process(target=self.learn, args=(prompt,)).start()
+                Process(target=self.update_corpus, args=(prompt,)).start()
+            self.update_cache(prompt)
         return response

From a973d80db9acc6c093d6626b7323f2b44d4fe620 Mon Sep 17 00:00:00 2001
From: Matt Hanlon
Date: Wed, 25 Aug 2021 14:30:34 -0700
Subject: [PATCH 6/6] eliminate multiprocessing and improve efficiency

- read brain from either yaml or plain text
- write learned corpus by appending to a plain text file
- use generators where possible to avoid big reads
- don't write new phrases if we don't learn from them
- get rid of <start> and <stop> tokens, they can be implicit in the corpus

This should make us performant enough we can forgo multiprocessing and
do everything in-process
---
 main.py   |  12 ++--
 markov.py | 183 +++++++++++++++++++++++++++++++-----------------------
 2 files changed, 113 insertions(+), 82 deletions(-)

diff --git a/main.py b/main.py
index 4238f7c..d083b96 100755
--- a/main.py
+++ b/main.py
@@ -32,15 +32,15 @@ parser.add_argument('-b', '--brain',
                     env_var="CB_BRAIN",
                     required=True,
-                    help="This bot's brain as a YAML file.")
+                    help="This bot's input brain as a YAML or newline-delimited text file.")
+parser.add_argument('-o', '--output',
+                    env_var="CB_OUTPUT",
+                    required=True,
+                    help="File for writing the updated corpus")
 parser.add_argument('-n', '--name',
                     env_var="CB_NAME",
                     required=True,
                     help="The name this bot will respond to in chats.")
-parser.add_argument('--skip_mp',
-                    env_var="CB_SKIP_MP",
-                    action="store_true",
-                    help="Skip the multiprocess stuff that can hinder debugging.")
 
 args = parser.parse_args()
 
 discord_token = args.discord_token
@@ -48,7 +48,7 @@ slack_app_token = args.slack_app_token
 bot_name = args.name
 
-brain = Markov(args.brain, bot_name.upper(), args.skip_mp)
+brain = Markov(args.brain, args.output, [bot_name])
 
 
 discord_client = discord.Client()

diff --git a/markov.py b/markov.py
index 475dff9..edb782c 100755
--- a/markov.py
+++ b/markov.py
@@ -1,105 +1,140 @@
-import yaml
 import random
-from multiprocess import Lock, Manager, Process
+import yaml
+from itertools import chain, groupby
+
+START_TOK = "<start>"
+STOP_TOK = "<stop>"
 
-START = "<start>"
-STOP = "<stop>"
+STOP = object()
+START = object()
 
 # instantiate a Markov object with the source file
 class Markov:
-    def __init__(self, brain_file: str, ignore_words, skip_mp=False):
-        self.brain_file = brain_file
-        self.ignore_words = ignore_words
-        self.skip_mp = skip_mp
-        if self.skip_mp:
-            self.words = list(self.load_corpus(brain_file))
-        else:
-            self.manager = Manager()
-            self.words = self.manager.list(self.load_corpus(brain_file))
-        self.update_cache()
+    def __init__(self, input_file: str, output_file: str, ignore_words):
+        if input_file == output_file:
+            raise ValueError("input and output files must be different")
 
-    @classmethod
-    def load_corpus(cls, source_file: str):
+        self.ignore_words = set(w.upper() for w in ignore_words)
+        self.output_file = output_file
+        self.update_graph_and_corpus(self.corpus_iter(input_file), init=True)
+
+    def corpus_iter(self, source_file: str):
+        """
+        Emit the contents of the source_file as an iterable of token sequences
+        """
         with open(source_file, 'r') as infile:
-            return yaml.load(infile.read(), Loader=yaml.Loader)
+            # this is dumb
+            if source_file.endswith(".yml") or source_file.endswith(".yaml"):
+                words = yaml.load(infile.read(), Loader=yaml.Loader)
+                for is_delim, phrase in groupby(words, lambda w: w in (START_TOK, STOP_TOK)):
+                    if not is_delim:
+                        yield list(phrase)
+            else:
+                for line in infile:
+                    yield from self.tokenize(line)
+
     @classmethod
-    def triples(cls, words):
-        if len(words) < 3:
+    def triples_and_stop(cls, words):
+        """
+        Emit 3-grams from the sequence of words, the last one ending with the
+        special STOP token
+        """
+        words = chain(words, [STOP])
+        try:
+            w1 = next(words)
+            w2 = next(words)
+            w3 = next(words)
+            while True:
+                yield (w1, w2, w3)
+                w1, w2, w3 = w2, w3, next(words)
+        except StopIteration:
             return
-        for i in range(len(words) - 2):
-            yield (words[i], words[i+1], words[i+2])
 
     def _ignore(self, word: str):
-        word.strip("\'\"!@#$%^&*().,/\\+=<>?:;").upper() in self.ignore_words
+        return word.strip("\'\"!@#$%^&*().,/\\+=<>?:;").upper() in self.ignore_words
 
     def tokenize(self, sentence: str):
-        words = [w for w in sentence.split() if not self._ignore(w)]
-        if not words:
-            return
+        """
+        Emit a sequence of token lists from the string, ignoring ignore_words.
+        A word ending in certain punctuation ends a given token sequence.
+        """
+        cur = []
+        for w in sentence.split():
+            if self._ignore(w):
+                pass
 
-        yield START
-        for w in words:
-            if any(c in w for c in ('.', '?', '!')):
-                yield STOP
-                yield w.strip(".?!")
-                yield START
+            elif any(w.endswith(c) for c in ('.', '?', '!')):
+                w = w.strip(".?!")
+                if w:
+                    cur.append(w)
+                yield(cur)
+                cur = []
             else:
-                yield w
-        yield STOP
+                cur.append(w)
+        if cur:
+            yield cur
 
-    def update_cache(self, new_sentence = None):
-        if new_sentence:
-            db = self.cache
-            words = list(self.tokenize(new_sentence))
-        else:
-            db = {START: []}
-            words = self.words
+    def _update_graph_and_emit_changes(self, token_seqs, init=False):
+        """
+        self.graph stores the graph of n-gram transitions.
+        The keys are single tokens or pairs and the values possible next words in the n-gram.
+        Initial tokens are also specially added to the list at the key START.
 
-        start_of_chain = True
-        for w1, w2, w3 in self.triples(words):
-            if w1 in (START, STOP) or w2 in (START, STOP) or w3 == START:
-                start_of_chain = True
-            else:
+        _update_graph_and_emit_changes returns a generator that when run will
+        update the graph with the ngrams taken from each element of token_seqs.
+
+        Yields the token sequences that result in updates so they can be further
+        acted on.
+
+        if init is True reinitialize from an empty graph
+        """
+        if init:
+            self.graph = {START: []}
+
+        for seq in token_seqs:
+            first = True
+            learned = False
+            for w1, w2, w3 in self.triples_and_stop(seq):
+                if first:
+                    if w1 not in self.graph[START]:
+                        self.graph[START].append(w1)
+                        learned = True
+                    next_words = self.graph.setdefault(w1, [])
                     if w2 not in next_words:
                         next_words.append(w2)
+                        learned = True
+                    first = False
+                next_words = self.graph.setdefault((w1, w2), [])
                 if w3 not in next_words:
                     next_words.append(w3)
+                    learned = True
+            if learned:
+                yield seq
 
-    def update_corpus(self, sentence: str):
-        new_words = list(self.tokenize(sentence))
-        if not new_words:
-            return
-        self.words += new_words
-        lk = None
-        if not self.skip_mp:
-            lk = Lock()
-        with open(self.brain_file, 'w') as outfile:
-            if not self.skip_mp:
-                lk.acquire()
-            outfile.write(yaml.dump(list(self.words), default_flow_style=True))
-            if not self.skip_mp:
-                lk.release()
+    def update_graph_and_corpus(self, token_seqs, init=False):
+        changes = self._update_graph_and_emit_changes(token_seqs, init=init)
+        self.update_corpus(changes, init=init)
+
+    def update_corpus(self, token_seqs, init=False):
+        mode = 'w' if init else 'a'
+        with open(self.output_file, mode) as f:
+            for seq in token_seqs:
+                f.write(" ".join(seq))
+                f.write("\n")
 
     def generate_markov_text(self, seed=None):
-        if seed:
+        if seed and seed in self.graph:
             w1 = seed
         else:
-            w1 = random.choice(self.cache[START])
-        w2 = random.choice(self.cache[w1])
+            w1 = random.choice(self.graph[START])
+        w2 = random.choice(self.graph[w1])
 
         gen_words = [w1]
         while True:
             if w2 == STOP:
                 break
-            w1, w2 = w2, random.choice(self.cache[(w1, w2)])
+            w1, w2 = w2, random.choice(self.graph[(w1, w2)])
             gen_words.append(w1)
 
         message = ' '.join(gen_words)
@@ -108,13 +143,9 @@ def generate_markov_text(self, seed=None):
     def create_response(self, prompt="", learn=False):
         # set seedword from somewhere in words if there's no prompt
         prompt_tokens = prompt.split()
-        valid_seeds = [tok for tok in prompt_tokens[:-2] if tok in self.cache and tok != START]
+        valid_seeds = [tok for tok in prompt_tokens[:-2] if tok in self.graph]
        seed_word = random.choice(valid_seeds) if valid_seeds else None
         response = self.generate_markov_text(seed_word)
         if learn:
-            if self.skip_mp:
-                self.update_corpus(prompt)
-            else:
-                Process(target=self.update_corpus, args=(prompt,)).start()
-            self.update_cache(prompt)
+            self.update_graph_and_corpus(self.tokenize(prompt))
         return response
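
For reference, a minimal sketch of how the reworked Markov class can be driven after PATCH 6/6. This is not part of the patch series: the file names and the ignore list below are illustrative, and the input brain file is assumed to already exist as a newline-delimited text corpus with at least a few sentences in it.

    # illustrative usage only, not from the patches above
    from markov import Markov

    # input and output corpora must be different files; the list holds words
    # (here, the bot's own name) to ignore while learning
    brain = Markov("brain.txt", "corpus_out.txt", ["codebro"])

    # reply seeded from the prompt when a prompt word exists in the graph,
    # otherwise start from a random start word; learn=True also appends any
    # newly learned phrases to corpus_out.txt
    print(brain.create_response("tell me about markov chains", learn=True))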