nowcoder001
diff --git a/‎.gitignore
Lines changed: 33 additions & 0 deletions b/‎.gitignore
Lines changed: 33 additions & 0 deletions
diff --git a/‎README.md
Lines changed: 1 addition & 0 deletions b/‎README.md
Lines changed: 1 addition & 0 deletions
diff --git a/‎pom.xml
Lines changed: 82 additions & 0 deletions b/‎pom.xml
Lines changed: 82 additions & 0 deletions
diff --git a/‎src/main/java/Abstract/BM25.java
Lines changed: 157 additions & 0 deletions b/‎src/main/java/Abstract/BM25.java
Lines changed: 157 additions & 0 deletions
diff --git a/‎src/main/java/Abstract/ExtractAbstract.java
Lines changed: 23 additions & 0 deletions b/‎src/main/java/Abstract/ExtractAbstract.java
Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,33 @@
+HELP.md
+target/
+!.mvn/wrapper/maven-wrapper.jar
+!**/src/main/**
+!**/src/test/**
+
+### STS ###
+.apt_generated
+.classpath
+.factorypath
+.project
+.settings
+.springBeans
+.sts4-cache
+
+### IntelliJ IDEA ###
+.idea
+*.iws
+*.iml
+*.ipr
+
+### NetBeans ###
+/nbproject/private/
+/nbbuild/
+/dist/
+/nbdist/
+/.nb-gradle/
+build/
+
+### VS Code ###
+.vscode/
+.DS_Store
+src/main/resources/ml-25m
@@ -0,0 +1 @@
+# Recommended-and-datacleaning-Algorithm
@@ -0,0 +1,82 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <configuration>
+                    <source>8</source>
+                    <target>8</target>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
+    <dependencies>
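+        <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-core (link kept for reference; no spark-core dependency is declared in this POM) -->
+
+        <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-mllib (link kept for reference; no spark-mllib dependency is declared in this POM, the entry below is pinyin4j) -->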
+        <dependency>
+            <groupId>com.belerweb</groupId>
+            <artifactId>pinyin4j</artifactId>
+            <version>2.5.0</version>
+        </dependency>
+        <dependency>
+            <groupId>com.janeluo</groupId>
+            <artifactId>ikanalyzer</artifactId>
+            <version>2012_u6</version>
+        </dependency>
+        <dependency>
+            <groupId>org.jsoup</groupId>
+            <artifactId>jsoup</artifactId>
+            <version>1.11.2</version>
+        </dependency>
+        <!-- https://mvnrepository.com/artifact/mysql/mysql-connector-java -->
+        <dependency>
+            <groupId>mysql</groupId>
+            <artifactId>mysql-connector-java</artifactId>
+            <version>8.0.20</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.mahout</groupId>
+            <artifactId>mahout</artifactId>
+            <version>0.11.1</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.mahout</groupId>
+            <artifactId>mahout-examples</artifactId>
+            <version>0.13.0</version>
+        </dependency>
+        <dependency>
+            <groupId>com.opencsv</groupId>
+            <artifactId>opencsv</artifactId>
+            <version>4.4</version>
+        </dependency>
+        <!-- https://mvnrepository.com/artifact/net.sourceforge.javacsv/javacsv -->
+        <dependency>
+            <groupId>net.sourceforge.javacsv</groupId>
+            <artifactId>javacsv</artifactId>
+            <version>2.0</version>
+        </dependency>
+        <dependency>
+            <groupId>com.alibaba</groupId>
+            <artifactId>fastjson</artifactId>
+            <version>1.2.5</version>
+        </dependency>
+        <!-- 分词器 （可以替换为其他中文分词器）-->
+        <dependency>
+            <groupId>com.hankcs</groupId>
+            <artifactId>hanlp</artifactId>
+            <version>portable-1.3.4</version>
+        </dependency>
+    </dependencies>
+    <groupId>DataMining</groupId>
+    <artifactId>RecommendedAlgorithm</artifactId>
+    <version>1.0-SNAPSHOT</version>
+
+
+</project>
@@ -0,0 +1,157 @@
+package Abstract;
+
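+/**
+ *  BM25 search relevance scoring algorithm.
+ *  It scores how relevant a document is to a search query,
+ *  and is derived from the probabilistic retrieval model.
+ *  https://www.jianshu.com/p/b4f06594d32f
+ *
+ *  Per query word it combines: the relevance between the word and the document D,
+ * the relevance between the word and the query,
+ * and the weight of the word.
+ * Summing the per-word scores then gives the score between the query and the document.
+ */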
+
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+
+public class BM25 { // BM25 scorer over one document that has been split into sentences, each a list of words
+    /**
+     * Number of sentences in the document (set to docs.size() in the constructor)
+     */
+    int D;
+
+    /**
+     * Average sentence length in words: total word count divided by D
+     */
+    double avgdl;
+
+    /**
+     * The document split into the form [sentence[word]]
+     */
+    List<List<String>> docs;
+
+    /**
+     * Term frequencies per sentence: f[i] maps each word of sentence i to its count in that sentence
+     */
+    Map<String, Integer>[] f;
+
+    /**
+     * Sentence frequency: maps every word of the document to the number of sentences that contain it
+     */
+    Map<String, Integer> df;
+
+    /**
+     * Inverse document frequency of every word, derived from df in init()
+     */
+    Map<String, Double> idf;
+
+    /**
+     * Tuning factor k1 of the BM25 formula; multiplies the length-normalisation term in sim()
+     */
+    final static float k1 = 1.5f;
+
+    /**
+     * Tuning factor b of the BM25 formula; weights the sentence-length ratio d / avgdl in sim()
+     */
+    final static float b = 0.75f;
+    public BM25(List<List<String>> docs) // precomputes avgdl, f, df and idf; keeps docs by reference (no copy)
+    {
+        this.docs = docs;
+        D = docs.size();
+        //average sentence length = total words / number of sentences (evaluates to NaN when docs is empty: 0.0 / 0)
+        for (List<String> sentence : docs)
+        {
+            avgdl += sentence.size();
+        }
+        avgdl /= D;
+        f = new Map[D];
+        df = new TreeMap<String, Integer>();
+        idf = new TreeMap<String, Double>();
+        init();
+    }
+
+    /**
+     * Fills f, df and idf from docs; called once from the constructor.
+     * Expects f, df and idf to be allocated already, as the constructor does before calling it.
+     */
+    private void init()
+    {
+        int index = 0;//index is the position of the current sentence within the document
+
+        for (List<String> sentence : docs)
+        {
+            //for the words of this sentence, count how often each one occurs in it (tf), stored as a String -> Integer map
+            Map<String, Integer> tf = new TreeMap<String, Integer>();
+            for (String word : sentence)
+
+            {
+                //increment the occurrence count of this word; a missing entry counts as 0
+                Integer freq = tf.get(word);
+                freq = (freq == null ? 0 : freq) + 1;
+                tf.put(word, freq);
+            }
+            f[index] = tf;//store the tf map that belongs to this sentence
+
+            //derive df from tf: each distinct word of this sentence adds 1 to the number of sentences containing it
+            for (Map.Entry<String, Integer> entry : tf.entrySet())
+            {
+
+                String word = entry.getKey();
+                Integer freq = df.get(word);
+                freq = (freq == null ? 0 : freq) + 1;
+                df.put(word, freq);
+            }
+            ++index;
+        }
+        //derive idf from df; the formula is log(D - freq + 0.5) - log(freq + 0.5), D = number of sentences, 0.5 = smoothing term; negative when freq > D / 2
+        for (Map.Entry<String, Integer> entry : df.entrySet())
+        {
+            //compute the inverse document frequency (idf) of this word
+            String word = entry.getKey();
+            Integer freq = entry.getValue();
+            idf.put(word, Math.log(D - freq + 0.5) - Math.log(freq + 0.5));
+        }
+    }
+
+    /**
+     * Computes the BM25 relevance score of a query sentence against the stored sentence at {@code index}.
+     * @param sentence words of the query; every occurrence is scored, words absent from the target sentence add nothing
+     * @param index 0-based position of the target sentence in docs
+     * @return the sum of the per-word BM25 terms
+     */
+    public double sim(List<String> sentence, int index)
+    {
+        double score = 0;
+        //for every word of the query sentence, add its BM25 relevance term with respect to the sentence at index
+        for (String word : sentence)
+        {
+            if (!f[index].containsKey(word)) continue;
+            int d = docs.get(index).size();//number of words in the sentence at index
+            Integer wf = f[index].get(word);//number of times word occurs in the sentence at index
+            //b controls how strongly sentence length affects the score: the larger b, the stronger the effect. The longer the sentence relative to avgdl, the larger the denominator and the smaller the score.
+            //Rationale: a longer sentence is more likely to contain the word anyway, so for the same wf it should count as less relevant than a shorter sentence.
+            score += (idf.get(word) * wf * (k1 + 1)
+                    / (wf + k1 * (1 - b + b * d
+                    / avgdl)));
+        }
+        //the accumulated value is the relevance score between sentence and the sentence at index
+        return score;
+    }
+
+    /**
+     * Scores a query sentence against every sentence of the document by calling sim() for each index.
+     * @param sentence words of the query sentence
+     * @return an array of length D where element i is sim(sentence, i)
+     */
+    public double[] simAll(List<String> sentence)
+    {
+        double[] scores = new double[D];
+        for (int i = 0; i < D; ++i)
+        {
+            scores[i] = sim(sentence, i);
+        }
+        return scores;
+    }
+}
@@ -0,0 +1,23 @@
+package Abstract;
+
+import com.hankcs.hanlp.HanLP;
+
+import java.util.List;
+
+public class ExtractAbstract { // demo class: summarises a hard-coded Chinese news text with HanLP and prints the result
+    public static void main(String [] args) // args is not read
+    {
+        String document = "四海网讯网讯，近日，有媒体报道称：章子怡真怀孕了!报道还援引知情人士消息称，" +
+                "“章子怡怀孕大概四五个月，预产期是年底前后，现在已经不接工作了。”这到底是怎么回事?消息是真是假?针对此消息，" +
+                "23日晚8时30分，华西都市报记者迅速联系上了与章子怡家里关系极好的知情人士，这位人士向华西都市报记者证实说：“子怡这次确实怀孕了。" +
+                "她已经36岁了，也该怀孕了。章子怡怀上汪峰的孩子后，子怡的父母亲十分高兴。子怡的母亲，已开始悉心照料女儿了。子怡的预产期大概是今年12月底。" +
+                "”当晚9时，华西都市报记者为了求证章子怡怀孕消息，又电话联系章子怡的亲哥哥章子男，但电话通了，一直没有人接听。有关章子怡怀孕的新闻自从2013年9月份章子怡和汪峰恋情以来，" +
+                "就被传N遍了!不过，时间跨入2015年，事情却发生着微妙的变化。2015年3月21日，章子怡担任制片人的电影《从天儿降》开机，在开机发布会上几张合影，让网友又燃起了好奇心：" +
+                "“章子怡真的怀孕了吗?”但后据证实，章子怡的“大肚照”只是影片宣传的噱头。过了四个月的7月22日，《太平轮》新一轮宣传，章子怡又被发现状态不佳，不时深呼吸，不自觉想捂住肚子" +
+                "，又觉得不妥。然后在8月的一天，章子怡和朋友吃饭，在酒店门口被风行工作室拍到了，疑似有孕在身!今年7月11日，汪峰本来在上海要举行演唱会，后来因为台风“灿鸿”取消了。" +
+                "而消息人士称，汪峰原来打算在演唱会上当着章子怡的面宣布重大消息，" +
+                "而且章子怡已经赴上海准备参加演唱会了，怎知遇到台风，只好延期，相信9月26日的演唱会应该还会有惊喜大白天下吧。";
+        List<String> sentenceList = HanLP.extractSummary(document, 3); // presumably 3 is the number of summary sentences requested; confirm against the HanLP API
+        System.out.println(sentenceList); // prints the returned list using List's toString format
+    }
+}
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+# Recommended-and-datacleaning-Algorithm`