Skip to content

Commit 98504dd

Browse files
lukas-vlcek authored and kimchy committed
Adding Wordnet synonym format
1 parent ae66135 commit 98504dd

File tree

2 files changed

+40
-163
lines changed

2 files changed

+40
-163
lines changed

modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/Analysis.java

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,4 +241,30 @@ public static List<String> loadWordList(Reader reader, String comment) throws IO
241241
}
242242
return result;
243243
}
244+
245+
/**
246+
* @return null If no settings set for "settingsPrefix + _path" then return null.
247+
*
248+
* @throws ElasticSearchIllegalArgumentException
249+
* If the Reader can not be instantiated.
250+
*/
251+
public static Reader getFileReader(Environment env, Settings settings, String settingPrefix) {
252+
String filePath = settings.get(settingPrefix + "_path", null);
253+
254+
if (filePath == null) {
255+
return null;
256+
}
257+
258+
URL fileUrl = env.resolveConfig(filePath);
259+
260+
Reader reader = null;
261+
try {
262+
reader = new InputStreamReader(fileUrl.openStream(), Charsets.UTF_8);
263+
} catch (IOException ioe) {
264+
String message = String.format("IOException while reading %s_path: %s", settingPrefix, ioe.getMessage());
265+
throw new ElasticSearchIllegalArgumentException(message);
266+
}
267+
268+
return reader;
269+
}
244270
}

modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/SynonymTokenFilterFactory.java

Lines changed: 14 additions & 163 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,10 @@
2525
import org.apache.lucene.analysis.TokenStream;
2626
import org.apache.lucene.analysis.Tokenizer;
2727
import org.apache.lucene.analysis.WhitespaceTokenizer;
28+
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
2829
import org.apache.lucene.analysis.synonym.SynonymFilter;
2930
import org.apache.lucene.analysis.synonym.SynonymMap;
31+
import org.apache.lucene.analysis.synonym.WordnetSynonymParser;
3032
import org.apache.lucene.util.CharsRef;
3133
import org.elasticsearch.ElasticSearchIllegalArgumentException;
3234
import org.elasticsearch.common.inject.Inject;
@@ -57,8 +59,8 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
5759
@Assisted String name, @Assisted Settings settings) {
5860
super(index, indexSettings, name, settings);
5961

60-
List<String> rules = Analysis.getWordList(env, settings, "synonyms");
61-
if (rules == null) {
62+
Reader rulesReader = Analysis.getFileReader(env, settings, "synonyms");
63+
if (rulesReader == null) {
6264
throw new ElasticSearchIllegalArgumentException("synonym requires either `synonyms` or `synonyms_path` to be configured");
6365
}
6466
this.ignoreCase = settings.getAsBoolean("ignore_case", false);
@@ -71,7 +73,7 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
7173
tokenizerFactoryFactory = indicesAnalysisService.tokenizerFactoryFactory(tokenizerName);
7274
}
7375
if (tokenizerFactoryFactory == null) {
74-
throw new ElasticSearchIllegalArgumentException("failed to fine tokenizer [" + tokenizerName + "] for synonym token filter");
76+
throw new ElasticSearchIllegalArgumentException("failed to find tokenizer [" + tokenizerName + "] for synonym token filter");
7577
}
7678
final TokenizerFactory tokenizerFactory = tokenizerFactoryFactory.create(tokenizerName, settings);
7779

@@ -84,13 +86,18 @@ protected TokenStreamComponents createComponents(String fieldName, Reader reader
8486
}
8587
};
8688

87-
CustomSynonymParser parser = new CustomSynonymParser(true, expand, analyzer);
8889
try {
89-
for (String rule : rules) {
90-
parser.addLine(rule);
90+
SynonymMap.Builder parser = null;
91+
92+
if (settings.get("format","wordnet").equalsIgnoreCase("wordnet")) {
93+
parser = new WordnetSynonymParser(true, expand, analyzer);
94+
((WordnetSynonymParser)parser).add(rulesReader);
95+
} else {
96+
parser = new SolrSynonymParser(true, expand, analyzer);
97+
((SolrSynonymParser)parser).add(rulesReader);
9198
}
9299
synonymMap = parser.build();
93-
} catch (IOException e) {
100+
} catch (Exception e) {
94101
throw new ElasticSearchIllegalArgumentException("failed to build synonyms", e);
95102
}
96103
}
@@ -99,160 +106,4 @@ protected TokenStreamComponents createComponents(String fieldName, Reader reader
99106
// fst is null means no synonyms
100107
return synonymMap.fst == null ? tokenStream : new SynonymFilter(tokenStream, synonymMap, ignoreCase);
101108
}
102-
103-
/**
104-
* Parser for the Solr synonyms format.
105-
* <ol>
106-
* <li> Blank lines and lines starting with '#' are comments.
107-
* <li> Explicit mappings match any token sequence on the LHS of "=>"
108-
* and replace with all alternatives on the RHS. These types of mappings
109-
* ignore the expand parameter in the constructor.
110-
* Example:
111-
* <blockquote>i-pod, i pod => ipod</blockquote>
112-
* <li> Equivalent synonyms may be separated with commas and give
113-
* no explicit mapping. In this case the mapping behavior will
114-
* be taken from the expand parameter in the constructor. This allows
115-
* the same synonym file to be used in different synonym handling strategies.
116-
* Example:
117-
* <blockquote>ipod, i-pod, i pod</blockquote>
118-
*
119-
* <li> Multiple synonym mapping entries are merged.
120-
* Example:
121-
* <blockquote>
122-
* foo => foo bar<br>
123-
* foo => baz<br><br>
124-
* is equivalent to<br><br>
125-
* foo => foo bar, baz
126-
* </blockquote>
127-
* </ol>
128-
*
129-
* @lucene.experimental
130-
*/
131-
public static class CustomSynonymParser extends SynonymMap.Builder {
132-
private final boolean expand;
133-
private final Analyzer analyzer;
134-
135-
public CustomSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) {
136-
super(dedup);
137-
this.expand = expand;
138-
this.analyzer = analyzer;
139-
}
140-
141-
public void add(Reader in) throws IOException, ParseException {
142-
LineNumberReader br = new LineNumberReader(in);
143-
try {
144-
addInternal(br);
145-
} catch (IllegalArgumentException e) {
146-
ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0);
147-
ex.initCause(e);
148-
throw ex;
149-
} finally {
150-
br.close();
151-
}
152-
}
153-
154-
public void addLine(String line) throws IOException {
155-
if (line.length() == 0 || line.charAt(0) == '#') {
156-
return;
157-
}
158-
159-
CharsRef inputs[];
160-
CharsRef outputs[];
161-
162-
// TODO: we could process this more efficiently.
163-
String sides[] = split(line, "=>");
164-
if (sides.length > 1) { // explicit mapping
165-
if (sides.length != 2) {
166-
throw new IllegalArgumentException("more than one explicit mapping specified on the same line");
167-
}
168-
String inputStrings[] = split(sides[0], ",");
169-
inputs = new CharsRef[inputStrings.length];
170-
for (int i = 0; i < inputs.length; i++) {
171-
inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef());
172-
}
173-
174-
String outputStrings[] = split(sides[1], ",");
175-
outputs = new CharsRef[outputStrings.length];
176-
for (int i = 0; i < outputs.length; i++) {
177-
outputs[i] = analyze(analyzer, unescape(outputStrings[i]).trim(), new CharsRef());
178-
}
179-
} else {
180-
String inputStrings[] = split(line, ",");
181-
inputs = new CharsRef[inputStrings.length];
182-
for (int i = 0; i < inputs.length; i++) {
183-
inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef());
184-
}
185-
if (expand) {
186-
outputs = inputs;
187-
} else {
188-
outputs = new CharsRef[1];
189-
outputs[0] = inputs[0];
190-
}
191-
}
192-
193-
// currently we include the term itself in the map,
194-
// and use includeOrig = false always.
195-
// this is how the existing filter does it, but its actually a bug,
196-
// especially if combined with ignoreCase = true
197-
for (int i = 0; i < inputs.length; i++) {
198-
for (int j = 0; j < outputs.length; j++) {
199-
add(inputs[i], outputs[j], false);
200-
}
201-
}
202-
}
203-
204-
private void addInternal(BufferedReader in) throws IOException {
205-
String line = null;
206-
while ((line = in.readLine()) != null) {
207-
addLine(line);
208-
}
209-
}
210-
211-
private static String[] split(String s, String separator) {
212-
ArrayList<String> list = new ArrayList<String>(2);
213-
StringBuilder sb = new StringBuilder();
214-
int pos = 0, end = s.length();
215-
while (pos < end) {
216-
if (s.startsWith(separator, pos)) {
217-
if (sb.length() > 0) {
218-
list.add(sb.toString());
219-
sb = new StringBuilder();
220-
}
221-
pos += separator.length();
222-
continue;
223-
}
224-
225-
char ch = s.charAt(pos++);
226-
if (ch == '\\') {
227-
sb.append(ch);
228-
if (pos >= end) break; // ERROR, or let it go?
229-
ch = s.charAt(pos++);
230-
}
231-
232-
sb.append(ch);
233-
}
234-
235-
if (sb.length() > 0) {
236-
list.add(sb.toString());
237-
}
238-
239-
return list.toArray(new String[list.size()]);
240-
}
241-
242-
private String unescape(String s) {
243-
if (s.indexOf("\\") >= 0) {
244-
StringBuilder sb = new StringBuilder();
245-
for (int i = 0; i < s.length(); i++) {
246-
char ch = s.charAt(i);
247-
if (ch == '\\' && i < s.length() - 1) {
248-
sb.append(s.charAt(++i));
249-
} else {
250-
sb.append(ch);
251-
}
252-
}
253-
return sb.toString();
254-
}
255-
return s;
256-
}
257-
}
258109
}

0 commit comments

Comments
 (0)