Usage examples for org.apache.lucene.analysis.MockTokenizer.WHITESPACE
MockTokenizer.WHITESPACE is a CharacterRunAutomaton constant that makes the tokenizer split on whitespace.
To view the full source code for each org.apache.lucene.analysis MockTokenizer WHITESPACE example,
click the Source Link.
From source file:brightsolid.solr.plugins.TestTargetPositionQuerySynonyms.java
License:Apache License
@Override public void setUp() throws Exception { super.setUp(); String testFile = "one, uno, un\n" + "two, dos, too\n" + "three, free, tres"; SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random())); parser.parse(new StringReader(testFile)); final SynonymMap map = parser.build(); Analyzer analyzer = new Analyzer() { @Override// www . ja va 2 s . c om protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true); return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, false)); } }; directory = newDirectory(); RandomIndexWriter iw = new RandomIndexWriter(random(), directory, analyzer); Document doc = new Document(); FieldType newType = new FieldType(org.apache.lucene.document.TextField.TYPE_STORED); newType.setOmitNorms(true); Field field = newField("field", "", newType); field.fieldType().setOmitNorms(true); doc.add(field); field.setStringValue("one two three"); iw.addDocument(doc); field.setStringValue("two three one"); iw.addDocument(doc); field.setStringValue("three one two"); iw.addDocument(doc); reader = iw.getReader(); iw.close(); searcher = newSearcher(reader); }
From source file:edu.stanford.lucene.analysis.TestCJKFoldingFilter.java
License:Open Source License
@Test public void testNonCJKtokens() throws Exception { TokenStream stream = new MockTokenizer(new StringReader("Des mot"), MockTokenizer.WHITESPACE, false); CJKFoldingFilter filter = new CJKFoldingFilter(stream); CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class); filter.reset();//from w w w . j a va 2 s. c om assertTermEquals("Des", filter, termAtt); assertTermEquals("mot", filter, termAtt); assertFalse(filter.incrementToken()); }
From source file:gov.nih.nlm.ncbi.seqr.tokenizer.TestTokenSizeFilter.java
License:Apache License
/** Verifies the filter's output for a three-token whitespace-split input. */
public void testTokenSizeFilter() throws IOException {
    TokenStream input = new MockTokenizer(new StringReader("short too long"), MockTokenizer.WHITESPACE, false);
    TokenSizeFilter filter = new TokenSizeFilter(input);
    assertTokenStreamContents(filter, new String[] { "3" });
}
From source file:info.freelibrary.solr.ISO639ConversionFilterTest.java
License:Apache License
/** * Does the work of the test using a {@link MockTokenizer}. * //from w w w. ja v a2 s.c om * @param aInput The ISO-639 two or three digit code * @param aExpected The expected results * @throws Exception If there is trouble tokenizing or converting */ static void assertConvertsTo(String aInput, String[] aExpected) throws Exception { Tokenizer tokenizer = new MockTokenizer(new StringReader(aInput), MockTokenizer.WHITESPACE, false); ISO639ConversionFilter filter = new ISO639ConversionFilter(tokenizer); assertTokenStreamContents(filter, aExpected); // TODO: Do we want to test with other tokenizers? Using WHITESPACE. /* * Source documentation: http://lucene.apache.org/core/3_6_2/api/test-framework/org/apache/lucene/analysis/MockTokenizer.html * http://lucene.apache.org/core/4_2_0/test-framework/org/apache/lucene/analysis/MockTokenizer.html * http://lucene.apache.org/core/4_4_0/test-framework/org/apache/lucene/analysis/MockTokenizer.html */ }
From source file:org.apache.solr.analysis.entity.TestEntityFilter.java
License:Apache License
/** Extracts the trailing numeric value from each "N/…/…/…/value" entity token. */
public void testNumericTokensFromFilter() throws Exception {
    String input = "N/2/6/2/10 N/8/57/5/20549 N/5/42/2/13";
    TokenStream stream = new EntityFilter(
            new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false),
            Pattern.compile("(.*)/([0-9]+)/([0-9]+)/([0-9]+)/(.*)?"),
            "N",
            false); // no sortable encoding
    assertTokenStreamContents(stream, new String[] { "10", "20549", "13" });
}
From source file:org.apache.solr.analysis.entity.TestEntityFilter.java
License:Apache License
/** Same numeric extraction as above, but with sortable double encoding enabled. */
public void testNumericTokensFromFilterWithSort() throws Exception {
    String input = "N/2/6/2/10 N/8/57/5/20549 N/5/42/2/13";
    TokenStream stream = new EntityFilter(
            new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false),
            Pattern.compile("(.*)/([0-9]+)/([0-9]+)/([0-9]+)/(.*)?"),
            "N",
            true); // emit sortable-encoded values
    String[] expected = new String[] {
            NumberUtils.double2sortableStr(10),
            NumberUtils.double2sortableStr(20549),
            NumberUtils.double2sortableStr(13) };
    assertTokenStreamContents(stream, expected);
}
From source file:org.apache.solr.analysis.entity.TestEntityFilter.java
License:Apache License
/** Money ("M") entities keep their raw trailing value, including scientific notation. */
public void testMoneyTokensFromFilter() throws Exception {
    String input = "M/2/4/2/15 M/8/47/6/25.00 M/10/48/13/9.24E7";
    TokenStream stream = new EntityFilter(
            new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false),
            Pattern.compile("(.*)/([0-9]+)/([0-9]+)/([0-9]+)/(.*)?"),
            "M",
            false); // no sortable encoding
    assertTokenStreamContents(stream, new String[] { "15", "25.00", "9.24E7" });
}
From source file:org.apache.solr.analysis.entity.TestEntityFilter.java
License:Apache License
/** Percentage ("P") entities are emitted as sortable doubles, negatives included. */
public void testPercentageTokensFromFilter() throws Exception {
    String input = "P/4/23/4/100 P/6/19/5/2.96 P/1/1/1/-1";
    TokenStream stream = new EntityFilter(
            new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false),
            Pattern.compile("(.*)/([0-9]+)/([0-9]+)/([0-9]+)/(.*)?"),
            "P",
            true); // emit sortable-encoded values
    String[] expected = new String[] {
            NumberUtils.double2sortableStr(100),
            NumberUtils.double2sortableStr(2.96),
            NumberUtils.double2sortableStr(-1) };
    assertTokenStreamContents(stream, expected);
}
From source file:org.apache.solr.analysis.entity.TestEntityFilter.java
License:Apache License
public void testDateTokensFromFilter() throws Exception { String input = " D/8/39/4/1934-01-01 D/4/5/17/2013-12 D/4/5/17/2013 D/4/19/8/XXXX-03-01 D/4/19/8/XXXX-03 D/4/19/8/XXXX-XX-01"; SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd"); SimpleDateFormat df1 = new SimpleDateFormat("yyyy-MM"); SimpleDateFormat df2 = new SimpleDateFormat("yyyy"); TokenStream ts = new EntityFilter( new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false), Pattern.compile("(.*)/([0-9]+)/([0-9]+)/([0-9]+)/(.*)?"), "D", true); Long l1 = df.parse("1934-01-01").getTime(); Long l2 = df1.parse("2013-12").getTime(); Long l3 = df2.parse("2013").getTime(); Long l4 = df.parse("1400-03-01").getTime(); Long l5 = df1.parse("1400-03").getTime(); Long l6 = df.parse("1400-01-01").getTime(); assertTokenStreamContents(ts,// w w w . ja v a 2s. c om new String[] { NumberUtils.long2sortableStr(l1), NumberUtils.long2sortableStr(l2), NumberUtils.long2sortableStr(l3), NumberUtils.long2sortableStr(l4), NumberUtils.long2sortableStr(l5), NumberUtils.long2sortableStr(l6) }); }
From source file:org.apache.solr.analysis.entity.TestEntityFilter.java
License:Apache License
/** Ratio ("R") entities keep their raw "a-b" trailing value; no encoding applied. */
public void testRatioTokensFromFilter() throws Exception {
    String input = "R/5/29/5/10-00 R/22/143/5/4.5-1 R/26/123/5/12-01";
    TokenStream stream = new EntityFilter(
            new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false),
            Pattern.compile("(.*)/([0-9]+)/([0-9]+)/([0-9]+)/(.*)?"),
            "R",
            false); // no sortable encoding
    assertTokenStreamContents(stream, new String[] { "10-00", "4.5-1", "12-01" });
}