Commit 355892e

update 6.用图挖掘找到感兴趣的人.ipynb
1 parent 9f1fc0f commit 355892e

1 file changed

《Python数据挖掘入门与实践》/6.用图挖掘找到感兴趣的人.ipynb

Lines changed: 327 additions & 15 deletions
@@ -392,6 +392,15 @@
 "# 寻找子图"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"hidden": true
+},
+"outputs": [],
+"source": []
+},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -714,39 +723,342 @@
 "compute_silhouette(0.25, friends)"
 ]
 },
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"# 第二部分"
+]
+},
 {
 "cell_type": "code",
 "execution_count": null,
-"metadata": {
-"hidden": true
-},
+"metadata": {},
 "outputs": [],
-"source": []
+"source": [
+"import os\n",
+"import json\n",
+"\n",
+"data_folder = os.path.join(os.path.expanduser(\"~\"), \"Data\", \"twitter\")\n",
+"friends_filename = os.path.join(data_folder, \"python_friends.json\")\n",
+"with open(friends_filename) as inf:\n",
+"    friends = json.load(inf)"
+]
 },
 {
 "cell_type": "code",
 "execution_count": null,
-"metadata": {
-"hidden": true
-},
+"metadata": {},
 "outputs": [],
-"source": []
+"source": [
+"friends = {user: set(friends[user]) for user in friends}"
+]
 },
 {
 "cell_type": "code",
 "execution_count": null,
-"metadata": {
-"hidden": true
-},
+"metadata": {},
 "outputs": [],
-"source": []
+"source": [
+"def compute_similarity(friends1, friends2):\n",
+"    set_friends1 = set(friends1)\n",
+"    set_friends2 = set(friends2)\n",
+"    return len(set_friends1 & set_friends2) / len(set_friends1 | set_friends2)"
+]
 },
 {
 "cell_type": "code",
 "execution_count": null,
-"metadata": {
-"hidden": true
-},
+"metadata": {},
+"outputs": [],
+"source": [
+"import networkx as nx\n",
+"def create_graph(friends, threshold=0):\n",
+"    G = nx.Graph()\n",
+"    weights = []\n",
+"    for user1 in friends.keys():\n",
+"        for user2 in friends.keys():\n",
+"            if user1 == user2:\n",
+"                continue\n",
+"            weight = compute_similarity(friends[user1], friends[user2])\n",
+"            weights.append(weight)\n",
+"            if weight >= threshold:\n",
+"                G.add_node(user1)\n",
+"                G.add_node(user2)\n",
+"                G.add_edge(user1, user2, weight=weight)\n",
+"    return G\n",
+"\n",
+"G = create_graph(friends, 0)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"%matplotlib inline\n",
+"from matplotlib import pyplot as plt\n",
+"plt.figure(figsize=(10,10))\n",
+"pos = nx.spring_layout(G)\n",
+"nx.draw_networkx_nodes(G, pos, node_size=500)\n",
+"\n",
+"edgewidth = [ d['weight'] for (u,v,d) in G.edges(data=True)]\n",
+"nx.draw_networkx_edges(G, pos, width=edgewidth)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"G = create_graph(friends, 0.1)\n",
+"sub_graphs = nx.connected_component_subgraphs(G)\n",
+"\n",
+"for i, sub_graph in enumerate(sub_graphs):\n",
+"    n_nodes = len(sub_graph.nodes())\n",
+"    print(\"Subgraph {0} has {1} nodes\".format(i, n_nodes))"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"G = create_graph(friends, 0.15)\n",
+"sub_graphs = nx.connected_component_subgraphs(G)\n",
+"\n",
+"for i, sub_graph in enumerate(sub_graphs):\n",
+"    n_nodes = len(sub_graph.nodes())\n",
+"    print(\"Subgraph {0} has {1} nodes\".format(i, n_nodes))"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"sub_graphs = nx.connected_component_subgraphs(G)\n",
+"label_dict = {}\n",
+"for i, sub_graph in enumerate(sub_graphs):\n",
+"    for node in sub_graph.nodes():\n",
+"        label_dict[node] = i\n",
+"labels = [label_dict[node] for node in G.nodes()]\n",
+"\n",
+"plt.figure(figsize=(10,10))\n",
+"nx.draw(G,node_color=labels,cmap=plt.cm.Paired, node_size=500)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"sub_graphs = nx.connected_component_subgraphs(G)\n",
+"plt.figure(figsize=(10,10))\n",
+"pos = nx.spring_layout(G)\n",
+"for i, sub_graph in enumerate(sub_graphs):\n",
+"    nodes = sub_graph.nodes()\n",
+"    edges = sub_graph.edges()\n",
+"    nx.draw_networkx_nodes(G, pos, nodes,node_size=500)\n",
+"    nx.draw_networkx_edges(G, pos, edges)\n",
+"    "
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"sub_graphs = nx.connected_component_subgraphs(G)\n",
+"n_subgraphs = nx.number_connected_components(G)\n",
+"\n",
+"fig = plt.figure(figsize=(20, (n_subgraphs * 3)))\n",
+"for i, sub_graph in enumerate(sub_graphs):\n",
+"    ax = fig.add_subplot(int(n_subgraphs / 2), 2, i)\n",
+"    ax.get_xaxis().set_visible(False)\n",
+"    ax.get_yaxis().set_visible(False)\n",
+"    pos = nx.spring_layout(G)\n",
+"    nx.draw_networkx_nodes(G, pos, sub_graph.nodes(), ax=ax, node_size=500)\n",
+"    nx.draw_networkx_edges(G, pos, sub_graph.edges(), ax=ax)\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"#from sklearn.metrics import silhouette_score\n",
+"import numpy as np\n",
+"\n",
+"def compute_silhouette(threshold, friends):\n",
+"    G = create_graph(friends, threshold=threshold)\n",
+"    if len(G.nodes()) == 0:\n",
+"        return -99 # Invalid graph\n",
+"    sub_graphs = nx.connected_component_subgraphs(G)\n",
+"    if not (2 <= nx.number_connected_components(G) < len(G.nodes()) - 1):\n",
+"        return -99 # Invalid number of components, Silhouette not defined\n",
+"    label_dict = {}\n",
+"    for i, sub_graph in enumerate(sub_graphs):\n",
+"        for node in sub_graph.nodes():\n",
+"            label_dict[node] = i\n",
+"    labels = np.array([label_dict[node] for node in G.nodes()])\n",
+"    X = nx.to_scipy_sparse_matrix(G).todense()\n",
+"    X = 1 - X\n",
+"    return silhouette_score(X, labels, metric='precomputed')\n",
+"\n",
+"\n",
+"print(compute_silhouette(0.1, friends))\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"from scipy.optimize import minimize #(fun, x0, args=(),\n",
+"\n",
+"def invert(func):\n",
+"    def inverted_function(*args, **kwds):\n",
+"        return -func(*args, **kwds)\n",
+"    return inverted_function\n",
+"\n",
+"result = minimize(invert(compute_silhouette), 0.1, method='nelder-mead', args=(friends,), options={'maxiter':10, })\n",
+"print(result)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"G = create_graph(friends, threshold=0.135)\n",
+"sub_graphs = nx.connected_component_subgraphs(G)\n",
+"\n",
+"for i, sub_graph in enumerate(sub_graphs):\n",
+"    n_nodes = len(sub_graph.nodes())\n",
+"    print(\"Subgraph {0} has {1} nodes\".format(i, n_nodes))"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"labels"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"X = 1-nx.to_scipy_sparse_matrix(G).todense()"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"def silhouette_score(X, labels, metric='precomputed'):\n",
+"    labels = np.array(labels)\n",
+"    print(labels.shape)\n",
+"    return np.mean(silhouette_samples(X, labels, metric=metric))\n",
+"\n",
+"def silhouette_samples(X, labels, metric='precomputed'):\n",
+"    print(X.shape)\n",
+"    distances = X #pairwise_distances(X, metric=metric, **kwds)\n",
+"    n = labels.shape[0]\n",
+"    A = np.array([_intra_cluster_distance(distances[i], labels, i)\n",
+"                  for i in range(n)])\n",
+"    B = np.array([_nearest_cluster_distance(distances[i], labels, i)\n",
+"                  for i in range(n)])\n",
+"    sil_samples = (B - A) / np.maximum(A, B)\n",
+"    # nan values are for clusters of size 1, and should be 0\n",
+"    return np.nan_to_num(sil_samples)\n",
+"\n",
+"def _intra_cluster_distance(distances_row, labels, i):\n",
+"    \"\"\"Calculate the mean intra-cluster distance for sample i.\n",
+"\n",
+"    Parameters\n",
+"    ----------\n",
+"    distances_row : array, shape = [n_samples]\n",
+"        Pairwise distance matrix between sample i and each sample.\n",
+"\n",
+"    labels : array, shape = [n_samples]\n",
+"        label values for each sample\n",
+"\n",
+"    i : int\n",
+"        Sample index being calculated. It is excluded from calculation and\n",
+"        used to determine the current label\n",
+"\n",
+"    Returns\n",
+"    -------\n",
+"    a : float\n",
+"        Mean intra-cluster distance for sample i\n",
+"    \"\"\"\n",
+"    mask = (labels == labels[i])\n",
+"    mask[i] = False\n",
+"    mask = mask.reshape(distances_row.shape)\n",
+"    #print(\"Cluster {}\".format(i))\n",
+"    #print(mask)\n",
+"    #print(distances_row.flatten())\n",
+"    #print(distances_row.flatten()[mask])\n",
+"    a = np.mean(distances_row[mask])\n",
+"    return a\n",
+"\n",
+"\n",
+"def _nearest_cluster_distance(distances_row, labels, i):\n",
+"    \"\"\"Calculate the mean nearest-cluster distance for sample i.\n",
+"\n",
+"    Parameters\n",
+"    ----------\n",
+"    distances_row : array, shape = [n_samples]\n",
+"        Pairwise distance matrix between sample i and each sample.\n",
+"\n",
+"    labels : array, shape = [n_samples]\n",
+"        label values for each sample\n",
+"\n",
+"    i : int\n",
+"        Sample index being calculated. It is used to determine the current\n",
+"        label.\n",
+"\n",
+"    Returns\n",
+"    -------\n",
+"    b : float\n",
+"        Mean nearest-cluster distance for sample i\n",
+"    \"\"\"\n",
+"    label = labels[i]\n",
+"    b = np.min([np.mean(distances_row[(labels == cur_label).reshape(distances_row.shape)])\n",
+"                for cur_label in set(labels) if not cur_label == label])\n",
+"    return b"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"silhouette_score(X, labels, metric='precomputed')"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
 "outputs": [],
 "source": []
 }
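
A few notes on the code this commit adds. compute_similarity is the Jaccard coefficient of two friend lists: the size of their intersection divided by the size of their union, so it ranges from 0 (no shared friends) to 1 (identical lists), and the division fails if both lists are empty. A quick check with made-up users (the names are illustrative only, not taken from the notebook's data file):

friends = {"alice": {"bob", "carol", "dave"},
           "bob": {"alice", "carol", "eve"}}
shared = friends["alice"] & friends["bob"]    # {"carol"}: 1 shared friend
combined = friends["alice"] | friends["bob"]  # 5 distinct accounts in total
print(len(shared) / len(combined))            # prints 0.2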
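
The added cells rely on nx.connected_component_subgraphs and nx.to_scipy_sparse_matrix. Both exist in the NetworkX releases the book targets, but the former was removed in NetworkX 2.4 and the latter in NetworkX 3.0. A minimal compatibility sketch for running the notebook on a current NetworkX; the helper names are mine, not part of the commit:

import networkx as nx

def connected_component_subgraphs(G):
    # Per-component subgraphs, the same objects the notebook iterates over.
    return (G.subgraph(nodes).copy() for nodes in nx.connected_components(G))

def dense_weight_matrix(G):
    # Stand-in for nx.to_scipy_sparse_matrix(G).todense() on NetworkX >= 3.0.
    return nx.to_scipy_sparse_array(G).todense()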
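
In the cell that draws one connected component per panel, fig.add_subplot(int(n_subgraphs / 2), 2, i) passes a zero-based panel index, but Matplotlib subplot positions are 1-based, and the row count needs rounding up when the number of components is odd. A corrected sketch (a suggested fix, not what the commit records), reusing G from the notebook and the connected_component_subgraphs helper above:

import math
from matplotlib import pyplot as plt

n_subgraphs = nx.number_connected_components(G)
n_rows = math.ceil(n_subgraphs / 2)
fig = plt.figure(figsize=(20, n_subgraphs * 3))
for i, sub_graph in enumerate(connected_component_subgraphs(G)):
    ax = fig.add_subplot(n_rows, 2, i + 1)  # subplot positions start at 1
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)
    pos = nx.spring_layout(G)
    nx.draw_networkx_nodes(G, pos, sub_graph.nodes(), ax=ax, node_size=500)
    nx.draw_networkx_edges(G, pos, sub_graph.edges(), ax=ax)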
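
The final cells score a threshold with the Silhouette Coefficient, s = (b - a) / max(a, b), where a is a node's mean distance to the other members of its own component and b is its mean distance to the nearest other component, with distance taken as 1 minus the edge weight. Because scipy.optimize.minimize only minimizes, the invert wrapper negates compute_silhouette, so minimizing the negation maximizes the score. Note that, as committed, print(compute_silhouette(0.1, friends)) appears before silhouette_score exists (the sklearn import above it is commented out and the notebook's own definition sits in a later cell), so that cell only runs after the later definitions have been executed or the import is restored. A condensed sketch of the same negate-and-minimize idea, assuming compute_silhouette and friends as defined in the notebook:

from scipy.optimize import minimize

# Nelder-Mead proposes candidate thresholds; each is scored by compute_silhouette,
# and minimizing the negated score maximizes the Silhouette Coefficient.
result = minimize(lambda t, f: -compute_silhouette(float(t[0]), f),
                  x0=0.1, args=(friends,),
                  method='nelder-mead', options={'maxiter': 10})
print(result.x[0], -result.fun)  # best threshold found and its Silhouette score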
