Add LDA file

susanli2016 · web-flow · commit 79b38dfdd0fe · 2017-07-16T20:56:46.000-04:00
diff --git a/LDA.ipynb b/LDA.ipynb
@@ -0,0 +1,127 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from nltk.tokenize import RegexpTokenizer\n",
+    "from stop_words import get_stop_words\n",
+    "from nltk.stem.porter import PorterStemmer\n",
+    "from gensim import corpora, models\n",
+    "import gensim"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# split text into runs of word characters, which drops punctuation\n",
+    "tokenizer = RegexpTokenizer(r'\\w+')\n",
+    "\n",
+    "# English stop words; the set copy makes the per-token membership test O(1)\n",
+    "en_stop = get_stop_words('en')\n",
+    "en_stop_set = frozenset(en_stop)\n",
+    "\n",
+    "# Porter stemmer used to reduce each token to its stem\n",
+    "p_stemmer = PorterStemmer()\n",
+    "\n",
+    "# create sample documents\n",
+    "doc_a = \"Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother.\"\n",
+    "doc_b = \"My mother spends a lot of time driving my brother around to baseball practice.\"\n",
+    "doc_c = \"Some health experts suggest that driving may cause increased tension and blood pressure.\"\n",
+    "doc_d = \"I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better.\"\n",
+    "doc_e = \"Health professionals say that brocolli is good for your health.\"\n",
+    "\n",
+    "# compile sample documents into a list\n",
+    "doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]\n",
+    "\n",
+    "# one list of cleaned, stemmed tokens per document\n",
+    "texts = []\n",
+    "\n",
+    "for doc in doc_set:\n",
+    "    # lowercase the document, then tokenize it\n",
+    "    raw = doc.lower()\n",
+    "    tokens = tokenizer.tokenize(raw)\n",
+    "\n",
+    "    # remove stop words from tokens\n",
+    "    stopped_tokens = [token for token in tokens if token not in en_stop_set]\n",
+    "\n",
+    "    # stem tokens\n",
+    "    stemmed_tokens = [p_stemmer.stem(token) for token in stopped_tokens]\n",
+    "\n",
+    "    texts.append(stemmed_tokens)\n",
+    "\n",
+    "# turn our tokenized documents into an id <-> term dictionary\n",
+    "dictionary = corpora.Dictionary(texts)\n",
+    "\n",
+    "# convert each tokenized document into its bag-of-words vector\n",
+    "corpus = [dictionary.doc2bow(text) for text in texts]\n",
+    "\n",
+    "# fit a 2-topic LDA model, making 20 passes over the corpus\n",
+    "# NOTE(review): no random_state is passed, so topics may vary between runs\n",
+    "ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word=dictionary, passes=20)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[(0, '0.072*\"drive\" + 0.043*\"health\" + 0.043*\"pressur\" + 0.043*\"caus\"'), (1, '0.081*\"brocolli\" + 0.081*\"good\" + 0.059*\"brother\" + 0.059*\"mother\"')]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(ldamodel.print_topics(num_words=4, num_topics=2))  # top 4 weighted terms for each of the 2 topics"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[(0, '0.072*\"drive\" + 0.043*\"health\" + 0.043*\"pressur\"'), (1, '0.081*\"brocolli\" + 0.081*\"good\" + 0.059*\"brother\"')]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(ldamodel.print_topics(num_topics=2, num_words=3))  # the model has only 2 topics; show the top 3 terms of each"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}