Commit 355892e

update 6.用图挖掘找到感兴趣的人.ipynb
1 parent 9f1fc0f commit 355892e

1 file changed

《Python数据挖掘入门与实践》/6.用图挖掘找到感兴趣的人.ipynb

Lines changed: 327 additions & 15 deletions
@@ -392,6 +392,15 @@
 "# 寻找子图"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"hidden": true
+},
+"outputs": [],
+"source": []
+},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -714,39 +723,342 @@
 "compute_silhouette(0.25, friends)"
 ]
 },
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"# 第二部分"
+]
+},
 {
 "cell_type": "code",
 "execution_count": null,
-"metadata": {
-"hidden": true
-},
+"metadata": {},
 "outputs": [],
-"source": []
+"source": [
+"import os\n",
+"import json\n",
+"\n",
+"data_folder = os.path.join(os.path.expanduser(\"~\"), \"Data\", \"twitter\")\n",
+"friends_filename = os.path.join(data_folder, \"python_friends.json\")\n",
+"with open(friends_filename) as inf:\n",
+"    friends = json.load(inf)"
+]
 },
 {
 "cell_type": "code",
 "execution_count": null,
-"metadata": {
-"hidden": true
-},
+"metadata": {},
 "outputs": [],
-"source": []
+"source": [
+"friends = {user: set(friends[user]) for user in friends}"
+]
 },
 {
 "cell_type": "code",
 "execution_count": null,
-"metadata": {
-"hidden": true
-},
+"metadata": {},
 "outputs": [],
-"source": []
+"source": [
+"def compute_similarity(friends1, friends2):\n",
+"    set_friends1 = set(friends1)\n",
+"    set_friends2 = set(friends2)\n",
+"    return len(set_friends1 & set_friends2) / len(set_friends1 | set_friends2)"
+]
 },
 {
 "cell_type": "code",
 "execution_count": null,
-"metadata": {
-"hidden": true
-},
+"metadata": {},
+"outputs": [],
+"source": [
+"import networkx as nx\n",
+"def create_graph(friends, threshold=0):\n",
+"    G = nx.Graph()\n",
+"    weights = []\n",
+"    for user1 in friends.keys():\n",
+"        for user2 in friends.keys():\n",
+"            if user1 == user2:\n",
+"                continue\n",
+"            weight = compute_similarity(friends[user1], friends[user2])\n",
+"            weights.append(weight)\n",
+"            if weight >= threshold:\n",
+"                G.add_node(user1)\n",
+"                G.add_node(user2)\n",
+"                G.add_edge(user1, user2, weight=weight)\n",
+"    return G\n",
+"\n",
+"G = create_graph(friends, 0)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"%matplotlib inline\n",
+"from matplotlib import pyplot as plt\n",
+"plt.figure(figsize=(10,10))\n",
+"pos = nx.spring_layout(G)\n",
+"nx.draw_networkx_nodes(G, pos, node_size=500)\n",
+"\n",
+"edgewidth = [ d['weight'] for (u,v,d) in G.edges(data=True)]\n",
+"nx.draw_networkx_edges(G, pos, width=edgewidth)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"G = create_graph(friends, 0.1)\n",
+"sub_graphs = nx.connected_component_subgraphs(G)\n",
+"\n",
+"for i, sub_graph in enumerate(sub_graphs):\n",
+"    n_nodes = len(sub_graph.nodes())\n",
+"    print(\"Subgraph {0} has {1} nodes\".format(i, n_nodes))"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"G = create_graph(friends, 0.15)\n",
+"sub_graphs = nx.connected_component_subgraphs(G)\n",
+"\n",
+"for i, sub_graph in enumerate(sub_graphs):\n",
+"    n_nodes = len(sub_graph.nodes())\n",
+"    print(\"Subgraph {0} has {1} nodes\".format(i, n_nodes))"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"sub_graphs = nx.connected_component_subgraphs(G)\n",
+"label_dict = {}\n",
+"for i, sub_graph in enumerate(sub_graphs):\n",
+"    for node in sub_graph.nodes():\n",
+"        label_dict[node] = i\n",
+"labels = [label_dict[node] for node in G.nodes()]\n",
+"\n",
+"plt.figure(figsize=(10,10))\n",
+"nx.draw(G,node_color=labels,cmap=plt.cm.Paired, node_size=500)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"sub_graphs = nx.connected_component_subgraphs(G)\n",
+"plt.figure(figsize=(10,10))\n",
+"pos = nx.spring_layout(G)\n",
+"for i, sub_graph in enumerate(sub_graphs):\n",
+"    nodes = sub_graph.nodes()\n",
+"    edges = sub_graph.edges()\n",
+"    nx.draw_networkx_nodes(G, pos, nodes,node_size=500)\n",
+"    nx.draw_networkx_edges(G, pos, edges)\n",
+"    "
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"sub_graphs = nx.connected_component_subgraphs(G)\n",
+"n_subgraphs = nx.number_connected_components(G)\n",
+"\n",
+"fig = plt.figure(figsize=(20, (n_subgraphs * 3)))\n",
+"for i, sub_graph in enumerate(sub_graphs):\n",
+"    ax = fig.add_subplot(int(n_subgraphs / 2), 2, i)\n",
+"    ax.get_xaxis().set_visible(False)\n",
+"    ax.get_yaxis().set_visible(False)\n",
+"    pos = nx.spring_layout(G)\n",
+"    nx.draw_networkx_nodes(G, pos, sub_graph.nodes(), ax=ax, node_size=500)\n",
+"    nx.draw_networkx_edges(G, pos, sub_graph.edges(), ax=ax)\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"#from sklearn.metrics import silhouette_score\n",
+"import numpy as np\n",
+"\n",
+"def compute_silhouette(threshold, friends):\n",
+"    G = create_graph(friends, threshold=threshold)\n",
+"    if len(G.nodes()) == 0:\n",
+"        return -99 # Invalid graph\n",
+"    sub_graphs = nx.connected_component_subgraphs(G)\n",
+"    if not (2 <= nx.number_connected_components(G) < len(G.nodes()) - 1):\n",
+"        return -99 # Invalid number of components, Silhouette not defined\n",
+"    label_dict = {}\n",
+"    for i, sub_graph in enumerate(sub_graphs):\n",
+"        for node in sub_graph.nodes():\n",
+"            label_dict[node] = i\n",
+"    labels = np.array([label_dict[node] for node in G.nodes()])\n",
+"    X = nx.to_scipy_sparse_matrix(G).todense()\n",
+"    X = 1 - X\n",
+"    return silhouette_score(X, labels, metric='precomputed')\n",
+"\n",
+"\n",
+"print(compute_silhouette(0.1, friends))\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"from scipy.optimize import minimize #(fun, x0, args=(),\n",
+"\n",
+"def invert(func):\n",
+"    def inverted_function(*args, **kwds):\n",
+"        return -func(*args, **kwds)\n",
+"    return inverted_function\n",
+"\n",
+"result = minimize(invert(compute_silhouette), 0.1, method='nelder-mead', args=(friends,), options={'maxiter':10, })\n",
+"print(result)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"G = create_graph(friends, threshold=0.135)\n",
+"sub_graphs = nx.connected_component_subgraphs(G)\n",
+"\n",
+"for i, sub_graph in enumerate(sub_graphs):\n",
+"    n_nodes = len(sub_graph.nodes())\n",
+"    print(\"Subgraph {0} has {1} nodes\".format(i, n_nodes))"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"labels"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"X = 1-nx.to_scipy_sparse_matrix(G).todense()"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"def silhouette_score(X, labels, metric='precomputed'):\n",
+"    labels = np.array(labels)\n",
+"    print(labels.shape)\n",
+"    return np.mean(silhouette_samples(X, labels, metric=metric))\n",
+"\n",
+"def silhouette_samples(X, labels, metric='precomputed'):\n",
+"    print(X.shape)\n",
+"    distances = X #pairwise_distances(X, metric=metric, **kwds)\n",
+"    n = labels.shape[0]\n",
+"    A = np.array([_intra_cluster_distance(distances[i], labels, i)\n",
+"                  for i in range(n)])\n",
+"    B = np.array([_nearest_cluster_distance(distances[i], labels, i)\n",
+"                  for i in range(n)])\n",
+"    sil_samples = (B - A) / np.maximum(A, B)\n",
+"    # nan values are for clusters of size 1, and should be 0\n",
+"    return np.nan_to_num(sil_samples)\n",
+"\n",
+"def _intra_cluster_distance(distances_row, labels, i):\n",
+"    \"\"\"Calculate the mean intra-cluster distance for sample i.\n",
+"\n",
+"    Parameters\n",
+"    ----------\n",
+"    distances_row : array, shape = [n_samples]\n",
+"        Pairwise distance matrix between sample i and each sample.\n",
+"\n",
+"    labels : array, shape = [n_samples]\n",
+"        label values for each sample\n",
+"\n",
+"    i : int\n",
+"        Sample index being calculated. It is excluded from calculation and\n",
+"        used to determine the current label\n",
+"\n",
+"    Returns\n",
+"    -------\n",
+"    a : float\n",
+"        Mean intra-cluster distance for sample i\n",
+"    \"\"\"\n",
+"    mask = (labels == labels[i])\n",
+"    mask[i] = False\n",
+"    mask = mask.reshape(distances_row.shape)\n",
+"    #print(\"Cluster {}\".format(i))\n",
+"    #print(mask)\n",
+"    #print(distances_row.flatten())\n",
+"    #print(distances_row.flatten()[mask])\n",
+"    a = np.mean(distances_row[mask])\n",
+"    return a\n",
+"\n",
+"\n",
+"def _nearest_cluster_distance(distances_row, labels, i):\n",
+"    \"\"\"Calculate the mean nearest-cluster distance for sample i.\n",
+"\n",
+"    Parameters\n",
+"    ----------\n",
+"    distances_row : array, shape = [n_samples]\n",
+"        Pairwise distance matrix between sample i and each sample.\n",
+"\n",
+"    labels : array, shape = [n_samples]\n",
+"        label values for each sample\n",
+"\n",
+"    i : int\n",
+"        Sample index being calculated. It is used to determine the current\n",
+"        label.\n",
+"\n",
+"    Returns\n",
+"    -------\n",
+"    b : float\n",
+"        Mean nearest-cluster distance for sample i\n",
+"    \"\"\"\n",
+"    label = labels[i]\n",
+"    b = np.min([np.mean(distances_row[(labels == cur_label).reshape(distances_row.shape)])\n",
+"                for cur_label in set(labels) if not cur_label == label])\n",
+"    return b"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"silhouette_score(X, labels, metric='precomputed')"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
 "outputs": [],
 "source": []
 }
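
A few notes on the code this commit adds. compute_similarity is the Jaccard coefficient of two friend lists: the size of their intersection divided by the size of their union, so it ranges from 0 (no shared friends) to 1 (identical lists), and the division fails if both lists are empty. A quick check with made-up users (the names are illustrative only, not taken from the notebook's data file):

friends = {"alice": {"bob", "carol", "dave"},
           "bob": {"alice", "carol", "eve"}}
shared = friends["alice"] & friends["bob"]    # {"carol"}: 1 shared friend
combined = friends["alice"] | friends["bob"]  # 5 distinct accounts in total
print(len(shared) / len(combined))            # prints 0.2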
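
The added cells rely on nx.connected_component_subgraphs and nx.to_scipy_sparse_matrix. Both exist in the NetworkX releases the book targets, but the former was removed in NetworkX 2.4 and the latter in NetworkX 3.0. A minimal compatibility sketch for running the notebook on a current NetworkX; the helper names are mine, not part of the commit:

import networkx as nx

def connected_component_subgraphs(G):
    # Per-component subgraphs, the same objects the notebook iterates over.
    return (G.subgraph(nodes).copy() for nodes in nx.connected_components(G))

def dense_weight_matrix(G):
    # Stand-in for nx.to_scipy_sparse_matrix(G).todense() on NetworkX >= 3.0.
    return nx.to_scipy_sparse_array(G).todense()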
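
In the cell that draws one connected component per panel, fig.add_subplot(int(n_subgraphs / 2), 2, i) passes a zero-based panel index, but Matplotlib subplot positions are 1-based, and the row count needs rounding up when the number of components is odd. A corrected sketch (a suggested fix, not what the commit records), reusing G from the notebook and the connected_component_subgraphs helper above:

import math
from matplotlib import pyplot as plt

n_subgraphs = nx.number_connected_components(G)
n_rows = math.ceil(n_subgraphs / 2)
fig = plt.figure(figsize=(20, n_subgraphs * 3))
for i, sub_graph in enumerate(connected_component_subgraphs(G)):
    ax = fig.add_subplot(n_rows, 2, i + 1)  # subplot positions start at 1
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)
    pos = nx.spring_layout(G)
    nx.draw_networkx_nodes(G, pos, sub_graph.nodes(), ax=ax, node_size=500)
    nx.draw_networkx_edges(G, pos, sub_graph.edges(), ax=ax)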
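
The final cells score a threshold with the Silhouette Coefficient, s = (b - a) / max(a, b), where a is a node's mean distance to the other members of its own component and b is its mean distance to the nearest other component, with distance taken as 1 minus the edge weight. Because scipy.optimize.minimize only minimizes, the invert wrapper negates compute_silhouette, so minimizing the negation maximizes the score. Note that, as committed, print(compute_silhouette(0.1, friends)) appears before silhouette_score exists (the sklearn import above it is commented out and the notebook's own definition sits in a later cell), so that cell only runs after the later definitions have been executed or the import is restored. A condensed sketch of the same negate-and-minimize idea, assuming compute_silhouette and friends as defined in the notebook:

from scipy.optimize import minimize

# Nelder-Mead proposes candidate thresholds; each is scored by compute_silhouette,
# and minimizing the negated score maximizes the Silhouette Coefficient.
result = minimize(lambda t, f: -compute_silhouette(float(t[0]), f),
                  x0=0.1, args=(friends,),
                  method='nelder-mead', options={'maxiter': 10})
print(result.x[0], -result.fun)  # best threshold found and its Silhouette score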
