|
392 | 392 | "# 寻找子图"
|
393 | 393 | ]
|
394 | 394 | },
|
| 395 | + { |
| 396 | + "cell_type": "code", |
| 397 | + "execution_count": null, |
| 398 | + "metadata": { |
| 399 | + "hidden": true |
| 400 | + }, |
| 401 | + "outputs": [], |
| 402 | + "source": [] |
| 403 | + }, |
395 | 404 | {
|
396 | 405 | "cell_type": "code",
|
397 | 406 | "execution_count": null,
|
|
714 | 723 | "compute_silhouette(0.25, friends)"
|
715 | 724 | ]
|
716 | 725 | },
|
| 726 | + { |
| 727 | + "cell_type": "markdown", |
| 728 | + "metadata": {}, |
| 729 | + "source": [ |
| 730 | + "# 第二部分" |
| 731 | + ] |
| 732 | + }, |
717 | 733 | {
|
718 | 734 | "cell_type": "code",
|
719 | 735 | "execution_count": null,
|
720 |
| - "metadata": { |
721 |
| - "hidden": true |
722 |
| - }, |
| 736 | + "metadata": {}, |
723 | 737 | "outputs": [],
|
724 |
| - "source": [] |
| 738 | + "source": [ |
| 739 | + "import os\n", |
| 740 | + "import json\n", |
| 741 | + "\n", |
# The friends data lives in ~/Data/twitter/python_friends.json.
data_folder = os.path.join(os.path.expanduser("~"), "Data", "twitter")
friends_filename = os.path.join(data_folder, "python_friends.json")
# State the encoding explicitly so decoding does not depend on the
# platform's default locale encoding.
with open(friends_filename, encoding="utf-8") as inf:
    friends = json.load(inf)
| 746 | + ] |
725 | 747 | },
|
726 | 748 | {
|
727 | 749 | "cell_type": "code",
|
728 | 750 | "execution_count": null,
|
729 |
| - "metadata": { |
730 |
| - "hidden": true |
731 |
| - }, |
| 751 | + "metadata": {}, |
732 | 752 | "outputs": [],
|
733 |
| - "source": [] |
| 753 | + "source": [ |
# Replace each user's collection of friends with a set, so that the
# intersection/union operations used later work directly on the values.
friends = {user: set(user_friends) for user, user_friends in friends.items()}
| 755 | + ] |
734 | 756 | },
|
735 | 757 | {
|
736 | 758 | "cell_type": "code",
|
737 | 759 | "execution_count": null,
|
738 |
| - "metadata": { |
739 |
| - "hidden": true |
740 |
| - }, |
| 760 | + "metadata": {}, |
741 | 761 | "outputs": [],
|
742 |
| - "source": [] |
| 762 | + "source": [ |
def compute_similarity(friends1, friends2):
    """Return the Jaccard similarity of two collections of friends.

    Parameters
    ----------
    friends1, friends2 : iterable of hashable
        Friend identifiers of the two users; each is converted to a set.

    Returns
    -------
    similarity : float
        Size of the intersection divided by the size of the union, in
        the range [0, 1]. Returns 0.0 when both collections are empty,
        where the ratio would otherwise be a division by zero.
    """
    set_friends1 = set(friends1)
    set_friends2 = set(friends2)
    union = set_friends1 | set_friends2
    if not union:
        # Neither user has any friends: nothing is shared.
        return 0.0
    return len(set_friends1 & set_friends2) / len(union)
| 767 | + ] |
743 | 768 | },
|
744 | 769 | {
|
745 | 770 | "cell_type": "code",
|
746 | 771 | "execution_count": null,
|
747 |
| - "metadata": { |
748 |
| - "hidden": true |
749 |
| - }, |
| 772 | + "metadata": {}, |
| 773 | + "outputs": [], |
| 774 | + "source": [ |
| 775 | + "import networkx as nx\n", |
def create_graph(friends, threshold=0):
    """Build an undirected similarity graph between users.

    Parameters
    ----------
    friends : dict
        Maps each user to the collection of that user's friends.
    threshold : float, optional
        Minimum similarity required for an edge to be added (default 0).

    Returns
    -------
    G : networkx.Graph
        One edge for every pair of distinct users whose
        ``compute_similarity`` value is >= ``threshold``, with that value
        stored in the edge's ``weight`` attribute. Users that end up
        with no edge are not present in the graph.
    """
    from itertools import combinations

    G = nx.Graph()
    # The graph is undirected and compute_similarity is symmetric, so each
    # unordered pair of users only needs to be scored once.
    for user1, user2 in combinations(friends, 2):
        weight = compute_similarity(friends[user1], friends[user2])
        if weight >= threshold:
            # add_edge creates both endpoint nodes when they are missing.
            G.add_edge(user1, user2, weight=weight)
    return G
| 790 | + "\n", |
| 791 | + "G = create_graph(friends, 0)" |
| 792 | + ] |
| 793 | + }, |
| 794 | + { |
| 795 | + "cell_type": "code", |
| 796 | + "execution_count": null, |
| 797 | + "metadata": {}, |
| 798 | + "outputs": [], |
| 799 | + "source": [ |
| 800 | + "%matplotlib inline\n", |
| 801 | + "from matplotlib import pyplot as plt\n", |
# Draw the whole graph: nodes first, then edges whose line width is the
# edge's weight attribute.
plt.figure(figsize=(10, 10))
pos = nx.spring_layout(G)
nx.draw_networkx_nodes(G, pos, node_size=500)

edgewidth = [attrs['weight'] for _, _, attrs in G.edges(data=True)]
nx.draw_networkx_edges(G, pos, width=edgewidth)
| 808 | + ] |
| 809 | + }, |
| 810 | + { |
| 811 | + "cell_type": "code", |
| 812 | + "execution_count": null, |
| 813 | + "metadata": {}, |
| 814 | + "outputs": [], |
| 815 | + "source": [ |
# Rebuild the graph with a 0.1 similarity threshold and report the size of
# each connected component.
G = create_graph(friends, 0.1)
# nx.connected_component_subgraphs() was removed in NetworkX 2.4; take the
# subgraph of each node set yielded by connected_components() instead.
sub_graphs = (G.subgraph(c) for c in nx.connected_components(G))

for i, sub_graph in enumerate(sub_graphs):
    n_nodes = len(sub_graph.nodes())
    print("Subgraph {0} has {1} nodes".format(i, n_nodes))
| 822 | + ] |
| 823 | + }, |
| 824 | + { |
| 825 | + "cell_type": "code", |
| 826 | + "execution_count": null, |
| 827 | + "metadata": {}, |
| 828 | + "outputs": [], |
| 829 | + "source": [ |
# Rebuild the graph with a 0.15 similarity threshold and report the size of
# each connected component.
G = create_graph(friends, 0.15)
# nx.connected_component_subgraphs() was removed in NetworkX 2.4; take the
# subgraph of each node set yielded by connected_components() instead.
sub_graphs = (G.subgraph(c) for c in nx.connected_components(G))

for i, sub_graph in enumerate(sub_graphs):
    n_nodes = len(sub_graph.nodes())
    print("Subgraph {0} has {1} nodes".format(i, n_nodes))
| 836 | + ] |
| 837 | + }, |
| 838 | + { |
| 839 | + "cell_type": "code", |
| 840 | + "execution_count": null, |
| 841 | + "metadata": {}, |
| 842 | + "outputs": [], |
| 843 | + "source": [ |
# Give every node the index of the connected component it belongs to, then
# draw the graph with nodes coloured by that index.
# nx.connected_component_subgraphs() was removed in NetworkX 2.4; take the
# subgraph of each node set yielded by connected_components() instead.
sub_graphs = (G.subgraph(c) for c in nx.connected_components(G))
label_dict = {}
for i, sub_graph in enumerate(sub_graphs):
    for node in sub_graph.nodes():
        label_dict[node] = i
# Keep the labels in the graph's own node order so they line up with the
# nodes when passed as node_color.
labels = [label_dict[node] for node in G.nodes()]

plt.figure(figsize=(10, 10))
nx.draw(G, node_color=labels, cmap=plt.cm.Paired, node_size=500)
| 853 | + ] |
| 854 | + }, |
| 855 | + { |
| 856 | + "cell_type": "code", |
| 857 | + "execution_count": null, |
| 858 | + "metadata": {}, |
| 859 | + "outputs": [], |
| 860 | + "source": [ |
# Draw the graph one connected component at a time on a shared layout.
# nx.connected_component_subgraphs() was removed in NetworkX 2.4; take the
# subgraph of each node set yielded by connected_components() instead.
sub_graphs = (G.subgraph(c) for c in nx.connected_components(G))
plt.figure(figsize=(10, 10))
pos = nx.spring_layout(G)
for sub_graph in sub_graphs:
    nodes = sub_graph.nodes()
    edges = sub_graph.edges()
    nx.draw_networkx_nodes(G, pos, nodes, node_size=500)
    nx.draw_networkx_edges(G, pos, edges)
| 870 | + ] |
| 871 | + }, |
| 872 | + { |
| 873 | + "cell_type": "code", |
| 874 | + "execution_count": null, |
| 875 | + "metadata": {}, |
| 876 | + "outputs": [], |
| 877 | + "source": [ |
# Draw every connected component in its own panel of a two-column grid.
# nx.connected_component_subgraphs() was removed in NetworkX 2.4; take the
# subgraph of each node set yielded by connected_components() instead.
sub_graphs = (G.subgraph(c) for c in nx.connected_components(G))
n_subgraphs = nx.number_connected_components(G)
# Two panels per row; round up so an odd number of components still fits.
n_rows = (n_subgraphs + 1) // 2

fig = plt.figure(figsize=(20, (n_subgraphs * 3)))
# Subplot positions are 1-based, so number the components from 1.
for i, sub_graph in enumerate(sub_graphs, start=1):
    ax = fig.add_subplot(n_rows, 2, i)
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)
    pos = nx.spring_layout(G)
    nx.draw_networkx_nodes(G, pos, sub_graph.nodes(), ax=ax, node_size=500)
    nx.draw_networkx_edges(G, pos, sub_graph.edges(), ax=ax)
| 889 | + ] |
| 890 | + }, |
| 891 | + { |
| 892 | + "cell_type": "code", |
| 893 | + "execution_count": null, |
| 894 | + "metadata": {}, |
| 895 | + "outputs": [], |
| 896 | + "source": [ |
| 897 | + "#from sklearn.metrics import silhouette_score\n", |
| 898 | + "import numpy as np\n", |
| 899 | + "\n", |
def compute_silhouette(threshold, friends):
    """Score the clustering obtained by thresholding the similarity graph.

    Parameters
    ----------
    threshold : float
        Minimum similarity for an edge; passed on to ``create_graph``.
    friends : dict
        Maps each user to that user's friends; passed on to
        ``create_graph``.

    Returns
    -------
    score : float
        Value of ``silhouette_score`` for the graph's connected
        components, using ``1 - edge weight`` as the precomputed distance
        between nodes. Returns -99 when the graph has no nodes or when
        the number of components is below 2 or not less than
        ``n_nodes - 1``.
    """
    G = create_graph(friends, threshold=threshold)
    n_nodes = len(G)
    if n_nodes == 0:
        return -99  # Invalid graph
    # connected_components() yields one node set per component; it replaces
    # nx.connected_component_subgraphs(), which was removed in NetworkX 2.4.
    components = list(nx.connected_components(G))
    if not (2 <= len(components) < n_nodes - 1):
        return -99  # Invalid number of components, Silhouette not defined
    label_dict = {node: i
                  for i, component in enumerate(components)
                  for node in component}
    # Labels follow the graph's node order, matching the matrix rows below.
    labels = np.array([label_dict[node] for node in G.nodes()])
    X = nx.to_scipy_sparse_matrix(G).todense()
    X = 1 - X  # turn similarities (edge weights) into distances
    return silhouette_score(X, labels, metric='precomputed')
| 915 | + "\n", |
| 916 | + "\n", |
| 917 | + "print(compute_silhouette(0.1, friends))\n" |
| 918 | + ] |
| 919 | + }, |
| 920 | + { |
| 921 | + "cell_type": "code", |
| 922 | + "execution_count": null, |
| 923 | + "metadata": {}, |
| 924 | + "outputs": [], |
| 925 | + "source": [ |
| 926 | + "from scipy.optimize import minimize #(fun, x0, args=(),\n", |
| 927 | + "\n", |
def invert(func):
    """Return a function that calls ``func`` and negates its result.

    Parameters
    ----------
    func : callable
        Function whose return value supports unary minus.

    Returns
    -------
    inverted_function : callable
        Accepts the same positional and keyword arguments as ``func`` and
        returns ``-func(*args, **kwds)``.
    """
    from functools import wraps

    # wraps() keeps func's name and docstring on the returned wrapper.
    @wraps(func)
    def inverted_function(*args, **kwds):
        return -func(*args, **kwds)
    return inverted_function
| 932 | + "\n", |
| 933 | + "result = minimize(invert(compute_silhouette), 0.1, method='nelder-mead', args=(friends,), options={'maxiter':10, })\n", |
| 934 | + "print(result)" |
| 935 | + ] |
| 936 | + }, |
| 937 | + { |
| 938 | + "cell_type": "code", |
| 939 | + "execution_count": null, |
| 940 | + "metadata": {}, |
| 941 | + "outputs": [], |
| 942 | + "source": [ |
# Rebuild the graph with a 0.135 similarity threshold and report the size
# of each connected component.
G = create_graph(friends, threshold=0.135)
# nx.connected_component_subgraphs() was removed in NetworkX 2.4; take the
# subgraph of each node set yielded by connected_components() instead.
sub_graphs = (G.subgraph(c) for c in nx.connected_components(G))

for i, sub_graph in enumerate(sub_graphs):
    n_nodes = len(sub_graph.nodes())
    print("Subgraph {0} has {1} nodes".format(i, n_nodes))
| 949 | + ] |
| 950 | + }, |
| 951 | + { |
| 952 | + "cell_type": "code", |
| 953 | + "execution_count": null, |
| 954 | + "metadata": {}, |
| 955 | + "outputs": [], |
| 956 | + "source": [ |
| 957 | + "labels" |
| 958 | + ] |
| 959 | + }, |
| 960 | + { |
| 961 | + "cell_type": "code", |
| 962 | + "execution_count": null, |
| 963 | + "metadata": {}, |
| 964 | + "outputs": [], |
| 965 | + "source": [ |
| 966 | + "X = 1-nx.to_scipy_sparse_matrix(G).todense()" |
| 967 | + ] |
| 968 | + }, |
| 969 | + { |
| 970 | + "cell_type": "code", |
| 971 | + "execution_count": null, |
| 972 | + "metadata": {}, |
| 973 | + "outputs": [], |
| 974 | + "source": [ |
def silhouette_score(X, labels, metric='precomputed'):
    """Return the mean silhouette coefficient over all samples.

    Parameters
    ----------
    X : array
        Pairwise distances, passed unchanged to ``silhouette_samples``.

    labels : array-like, shape = [n_samples]
        Cluster label of each sample; converted to a numpy array.

    metric : str
        Forwarded to ``silhouette_samples``.

    Returns
    -------
    score : float
        Mean of the per-sample silhouette coefficients.
    """
    labels = np.array(labels)
    return np.mean(silhouette_samples(X, labels, metric=metric))
| 979 | + "\n", |
def silhouette_samples(X, labels, metric='precomputed'):
    """Compute the silhouette coefficient of each sample.

    Parameters
    ----------
    X : array
        Precomputed pairwise distances; ``X[i]`` is used as the distances
        from sample i to every sample.

    labels : array, shape = [n_samples]
        Cluster label of each sample.

    metric : str
        Not used here: ``X`` is always treated as precomputed distances.

    Returns
    -------
    sil_samples : array, shape = [n_samples]
        ``(b - a) / max(a, b)`` for each sample, where ``a`` is its mean
        intra-cluster distance and ``b`` its mean nearest-cluster
        distance; nan results are replaced by 0.
    """
    distances = X
    n = labels.shape[0]
    A = np.array([_intra_cluster_distance(distances[i], labels, i)
                  for i in range(n)])
    B = np.array([_nearest_cluster_distance(distances[i], labels, i)
                  for i in range(n)])
    sil_samples = (B - A) / np.maximum(A, B)
    # nan values are for clusters of size 1, and should be 0
    return np.nan_to_num(sil_samples)
| 991 | + "\n", |
def _intra_cluster_distance(distances_row, labels, i):
    """Calculate the mean intra-cluster distance for sample i.

    Parameters
    ----------
    distances_row : array, shape = [n_samples]
        Distances between sample i and each sample.

    labels : array, shape = [n_samples]
        label values for each sample

    i : int
        Sample index being calculated. It is excluded from calculation and
        used to determine the current label

    Returns
    -------
    a : float
        Mean intra-cluster distance for sample i; nan when sample i is
        the only member of its cluster.
    """
    mask = (labels == labels[i])
    mask[i] = False  # leave the sample itself out of the mean
    if not mask.any():
        # Singleton cluster: return nan directly rather than taking the
        # mean of an empty selection, which gives nan plus a warning.
        return np.nan
    # Match the row's shape so the mask also indexes a (1, n) row.
    mask = mask.reshape(distances_row.shape)
    a = np.mean(distances_row[mask])
    return a
| 1021 | + "\n", |
| 1022 | + "\n", |
def _nearest_cluster_distance(distances_row, labels, i):
    """Calculate the mean nearest-cluster distance for sample i.

    Parameters
    ----------
    distances_row : array, shape = [n_samples]
        Distances between sample i and each sample.

    labels : array, shape = [n_samples]
        label values for each sample

    i : int
        Sample index being calculated. It is used to determine the current
        label.

    Returns
    -------
    b : float
        Mean nearest-cluster distance for sample i

    Raises
    ------
    ValueError
        If ``labels`` contains no label other than sample i's own.
    """
    label = labels[i]
    other_labels = [cur_label for cur_label in set(labels)
                    if cur_label != label]
    if not other_labels:
        raise ValueError("nearest-cluster distance needs at least two "
                         "distinct labels")
    # Mean distance from sample i to each other cluster; keep the smallest.
    b = np.min([np.mean(distances_row[(labels == cur_label).reshape(distances_row.shape)])
                for cur_label in other_labels])
    return b
| 1047 | + ] |
| 1048 | + }, |
| 1049 | + { |
| 1050 | + "cell_type": "code", |
| 1051 | + "execution_count": null, |
| 1052 | + "metadata": {}, |
| 1053 | + "outputs": [], |
| 1054 | + "source": [ |
| 1055 | + "silhouette_score(X, labels, metric='precomputed')" |
| 1056 | + ] |
| 1057 | + }, |
| 1058 | + { |
| 1059 | + "cell_type": "code", |
| 1060 | + "execution_count": null, |
| 1061 | + "metadata": {}, |
750 | 1062 | "outputs": [],
|
751 | 1063 | "source": []
|
752 | 1064 | }
|
|
0 commit comments