@@ -25,8 +25,10 @@
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.synonym.SolrSynonymParser;
 import org.apache.lucene.analysis.synonym.SynonymFilter;
 import org.apache.lucene.analysis.synonym.SynonymMap;
+import org.apache.lucene.analysis.synonym.WordnetSynonymParser;
 import org.apache.lucene.util.CharsRef;
 import org.elasticsearch.ElasticSearchIllegalArgumentException;
 import org.elasticsearch.common.inject.Inject;
@@ -57,8 +59,8 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
                                      @Assisted String name, @Assisted Settings settings) {
         super(index, indexSettings, name, settings);
 
-        List<String> rules = Analysis.getWordList(env, settings, "synonyms");
-        if (rules == null) {
+        Reader rulesReader = Analysis.getFileReader(env, settings, "synonyms");
+        if (rulesReader == null) {
             throw new ElasticSearchIllegalArgumentException("synonym requires either `synonyms` or `synonyms_path` to be configured");
         }
         this.ignoreCase = settings.getAsBoolean("ignore_case", false);
@@ -71,7 +73,7 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
             tokenizerFactoryFactory = indicesAnalysisService.tokenizerFactoryFactory(tokenizerName);
         }
         if (tokenizerFactoryFactory == null) {
-            throw new ElasticSearchIllegalArgumentException("failed to fine tokenizer [" + tokenizerName + "] for synonym token filter");
+            throw new ElasticSearchIllegalArgumentException("failed to find tokenizer [" + tokenizerName + "] for synonym token filter");
         }
         final TokenizerFactory tokenizerFactory = tokenizerFactoryFactory.create(tokenizerName, settings);
 
@@ -84,13 +86,18 @@ protected TokenStreamComponents createComponents(String fieldName, Reader reader
             }
         };
 
-        CustomSynonymParser parser = new CustomSynonymParser(true, expand, analyzer);
         try {
-            for (String rule : rules) {
-                parser.addLine(rule);
+            SynonymMap.Builder parser = null;
+
+            if (settings.get("format", "wordnet").equalsIgnoreCase("wordnet")) {
+                parser = new WordnetSynonymParser(true, expand, analyzer);
+                ((WordnetSynonymParser) parser).add(rulesReader);
+            } else {
+                parser = new SolrSynonymParser(true, expand, analyzer);
+                ((SolrSynonymParser) parser).add(rulesReader);
             }
             synonymMap = parser.build();
-        } catch (IOException e) {
+        } catch (Exception e) {
             throw new ElasticSearchIllegalArgumentException("failed to build synonyms", e);
         }
     }
@@ -99,160 +106,4 @@ protected TokenStreamComponents createComponents(String fieldName, Reader reader
         // fst is null means no synonyms
         return synonymMap.fst == null ? tokenStream : new SynonymFilter(tokenStream, synonymMap, ignoreCase);
     }
-
-    /**
-     * Parser for the Solr synonyms format.
-     * <ol>
-     * <li> Blank lines and lines starting with '#' are comments.
-     * <li> Explicit mappings match any token sequence on the LHS of "=>"
-     * and replace with all alternatives on the RHS. These types of mappings
-     * ignore the expand parameter in the constructor.
-     * Example:
-     * <blockquote>i-pod, i pod => ipod</blockquote>
-     * <li> Equivalent synonyms may be separated with commas and give
-     * no explicit mapping. In this case the mapping behavior will
-     * be taken from the expand parameter in the constructor. This allows
-     * the same synonym file to be used in different synonym handling strategies.
-     * Example:
-     * <blockquote>ipod, i-pod, i pod</blockquote>
-     *
-     * <li> Multiple synonym mapping entries are merged.
-     * Example:
-     * <blockquote>
-     * foo => foo bar<br>
-     * foo => baz<br><br>
-     * is equivalent to<br><br>
-     * foo => foo bar, baz
-     * </blockquote>
-     * </ol>
-     *
-     * @lucene.experimental
-     */
-    public static class CustomSynonymParser extends SynonymMap.Builder {
-        private final boolean expand;
-        private final Analyzer analyzer;
-
-        public CustomSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) {
-            super(dedup);
-            this.expand = expand;
-            this.analyzer = analyzer;
-        }
-
-        public void add(Reader in) throws IOException, ParseException {
-            LineNumberReader br = new LineNumberReader(in);
-            try {
-                addInternal(br);
-            } catch (IllegalArgumentException e) {
-                ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0);
-                ex.initCause(e);
-                throw ex;
-            } finally {
-                br.close();
-            }
-        }
-
-        public void addLine(String line) throws IOException {
-            if (line.length() == 0 || line.charAt(0) == '#') {
-                return;
-            }
-
-            CharsRef inputs[];
-            CharsRef outputs[];
-
-            // TODO: we could process this more efficiently.
-            String sides[] = split(line, "=>");
-            if (sides.length > 1) { // explicit mapping
-                if (sides.length != 2) {
-                    throw new IllegalArgumentException("more than one explicit mapping specified on the same line");
-                }
-                String inputStrings[] = split(sides[0], ",");
-                inputs = new CharsRef[inputStrings.length];
-                for (int i = 0; i < inputs.length; i++) {
-                    inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef());
-                }
-
-                String outputStrings[] = split(sides[1], ",");
-                outputs = new CharsRef[outputStrings.length];
-                for (int i = 0; i < outputs.length; i++) {
-                    outputs[i] = analyze(analyzer, unescape(outputStrings[i]).trim(), new CharsRef());
-                }
-            } else {
-                String inputStrings[] = split(line, ",");
-                inputs = new CharsRef[inputStrings.length];
-                for (int i = 0; i < inputs.length; i++) {
-                    inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef());
-                }
-                if (expand) {
-                    outputs = inputs;
-                } else {
-                    outputs = new CharsRef[1];
-                    outputs[0] = inputs[0];
-                }
-            }
-
-            // currently we include the term itself in the map,
-            // and use includeOrig = false always.
-            // this is how the existing filter does it, but its actually a bug,
-            // especially if combined with ignoreCase = true
-            for (int i = 0; i < inputs.length; i++) {
-                for (int j = 0; j < outputs.length; j++) {
-                    add(inputs[i], outputs[j], false);
-                }
-            }
-        }
-
-        private void addInternal(BufferedReader in) throws IOException {
-            String line = null;
-            while ((line = in.readLine()) != null) {
-                addLine(line);
-            }
-        }
-
-        private static String[] split(String s, String separator) {
-            ArrayList<String> list = new ArrayList<String>(2);
-            StringBuilder sb = new StringBuilder();
-            int pos = 0, end = s.length();
-            while (pos < end) {
-                if (s.startsWith(separator, pos)) {
-                    if (sb.length() > 0) {
-                        list.add(sb.toString());
-                        sb = new StringBuilder();
-                    }
-                    pos += separator.length();
-                    continue;
-                }
-
-                char ch = s.charAt(pos++);
-                if (ch == '\\') {
-                    sb.append(ch);
-                    if (pos >= end) break; // ERROR, or let it go?
-                    ch = s.charAt(pos++);
-                }
-
-                sb.append(ch);
-            }
-
-            if (sb.length() > 0) {
-                list.add(sb.toString());
-            }
-
-            return list.toArray(new String[list.size()]);
-        }
-
-        private String unescape(String s) {
-            if (s.indexOf("\\") >= 0) {
-                StringBuilder sb = new StringBuilder();
-                for (int i = 0; i < s.length(); i++) {
-                    char ch = s.charAt(i);
-                    if (ch == '\\' && i < s.length() - 1) {
-                        sb.append(s.charAt(++i));
-                    } else {
-                        sb.append(ch);
-                    }
-                }
-                return sb.toString();
-            }
-            return s;
-        }
-    }
 }
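For context, here is a minimal sketch of how the options this patch reads (`synonyms`/`synonyms_path`, `format`, `ignore_case`, `expand`) might be assembled with the 0.x-era Settings builder. The key names come from the constructor above; the builder calls and the rules-file path are illustrative assumptions, not part of the commit:

import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;

public class SynonymSettingsSketch {
    public static void main(String[] args) {
        // Mirrors an index analysis block such as index.analysis.filter.<name>.*
        // Note: with this patch, `format` defaults to "wordnet"; any other
        // value falls through to the SolrSynonymParser branch.
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("type", "synonym")
                .put("synonyms_path", "analysis/wn_s.pl") // hypothetical WordNet prolog file
                .put("format", "wordnet")                 // selects WordnetSynonymParser
                .put("ignore_case", true)
                .put("expand", true)
                .build();
        System.out.println(settings.get("format", "wordnet"));
    }
}

Lucene's WordnetSynonymParser consumes the WordNet prolog layout (lines such as s(100000001,1,'ipod',n,1,0).), while the SolrSynonymParser branch accepts the comma and "=>" grammar documented in the javadoc of the CustomSynonymParser class removed above.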