List of usage examples for org.apache.lucene.analysis.TokenStream.end()
public void end() throws IOException

This method is called by the consumer after the last token has been consumed, i.e. after incrementToken() returned false (using the new TokenStream API), so that the stream can perform end-of-stream operations such as setting the final offset.
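Before the project-specific examples, here is a minimal sketch of the full consumer contract — reset(), incrementToken() until it returns false, end(), close() — assuming a recent Lucene release where StandardAnalyzer has a no-argument constructor; the class name EndUsageSketch and the field name "body" are illustrative only:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class EndUsageSketch {
    // Tokenizes text and returns the terms, following the standard consumer workflow:
    // reset() -> incrementToken() until false -> end() -> close().
    public static List<String> tokenize(String text) throws IOException {
        List<String> terms = new ArrayList<>();
        try (Analyzer analyzer = new StandardAnalyzer();
                TokenStream stream = analyzer.tokenStream("body", text)) { // "body" is an arbitrary field name
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset(); // must precede the first incrementToken()
            while (stream.incrementToken()) {
                terms.add(termAtt.toString());
            }
            stream.end(); // called once incrementToken() has returned false
        } // try-with-resources closes the stream and the analyzer
        return terms;
    }
}

Note that several of the older examples below use TermAttribute, which plays the role that CharTermAttribute plays in current Lucene versions.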
From source file:org.sc.probro.lucene.BiothesaurusSearcher.java
License:Apache License
public String[] tokenize(String input) {
    ArrayList<String> tokens = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(input));
        TermAttribute termattr = (TermAttribute) stream.getAttribute(TermAttribute.class);
        //stream = new LowerCaseFilter(stream);
        stream.reset();
        while (stream.incrementToken()) {
            if (stream.hasAttribute(TermAttribute.class)) {
                String term = termattr.term();
                tokens.add(term);
            }
        }
        stream.end();
        stream.close();
    } catch (IllegalArgumentException e) {
        System.err.println(String.format("Phrase: \"%s\"", input));
        e.printStackTrace(System.err);
    } catch (IOException e) {
        System.err.println(String.format("Phrase: \"%s\"", input));
        e.printStackTrace();
    }
    return tokens.toArray(new String[0]);
}
From source file:org.sc.probro.lucene.BiothesaurusSearcher.java
License:Apache License
public Query createPhraseQuery(String field, String phrase) throws IOException {
    PhraseQuery query = new PhraseQuery();
    /*
    String[] array = phrase.split("\\s+");
    for(int i = 0; i < array.length; i++) {
        query.add(new Term(field, array[i]));
    }
    */
    try {
        TokenStream stream = analyzer.tokenStream(field, new StringReader(phrase));
        //stream = new LowerCaseFilter(stream);
        stream.reset();
        while (stream.incrementToken()) {
            if (stream.hasAttribute(TermAttribute.class)) {
                TermAttribute termattr = (TermAttribute) stream.getAttribute(TermAttribute.class);
                Term t = new Term(field, termattr.term());
                query.add(t);
            }
        }
        stream.end();
        stream.close();
    } catch (IllegalArgumentException e) {
        e.printStackTrace(System.err);
        System.err.println(String.format("Phrase: \"%s\"", phrase));
    }
    return query;
}
From source file:org.sc.probro.lucene.ProteinSearcher.java
License:Apache License
public String[] tokenize(String input) {
    ArrayList<String> tokens = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(input));
        stream = new LowerCaseFilter(stream);
        stream.reset();
        while (stream.incrementToken()) {
            if (stream.hasAttribute(TermAttribute.class)) {
                TermAttribute termattr = (TermAttribute) stream.getAttribute(TermAttribute.class);
                String term = termattr.term();
                tokens.add(term);
            }
        }
        stream.end();
        stream.close();
    } catch (IllegalArgumentException e) {
        System.err.println(String.format("Phrase: \"%s\"", input));
        e.printStackTrace(System.err);
    } catch (IOException e) {
        System.err.println(String.format("Phrase: \"%s\"", input));
        e.printStackTrace();
    }
    return tokens.toArray(new String[0]);
}
From source file:org.sc.probro.lucene.ProteinSearcher.java
License:Apache License
public Query createPhraseQuery(String field, String phrase) throws IOException {
    PhraseQuery query = new PhraseQuery();
    /*
    String[] array = phrase.split("\\s+");
    for(int i = 0; i < array.length; i++) {
        query.add(new Term(field, array[i]));
    }
    */
    try {
        TokenStream stream = analyzer.tokenStream(field, new StringReader(phrase));
        stream = new LowerCaseFilter(stream);
        stream.reset();
        while (stream.incrementToken()) {
            if (stream.hasAttribute(TermAttribute.class)) {
                TermAttribute termattr = (TermAttribute) stream.getAttribute(TermAttribute.class);
                Term t = new Term(field, termattr.term());
                query.add(t);
            }
        }
        stream.end();
        stream.close();
    } catch (IllegalArgumentException e) {
        e.printStackTrace(System.err);
        System.err.println(String.format("Phrase: \"%s\"", phrase));
    }
    return query;
}
From source file:org.sindice.siren.analysis.filter.TestURINormalisationFilter.java
License:Apache License
public void assertNormalisesTo(final Tokenizer t, final String input, final String[] expectedImages,
        final String[] expectedTypes, final int[] expectedPosIncrs, final int[] expectedTupleID,
        final int[] expectedCellID) throws Exception {
    assertTrue("has TermAttribute", t.hasAttribute(TermAttribute.class));
    final TermAttribute termAtt = t.getAttribute(TermAttribute.class);

    TypeAttribute typeAtt = null;
    if (expectedTypes != null) {
        assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class));
        typeAtt = t.getAttribute(TypeAttribute.class);
    }

    PositionIncrementAttribute posIncrAtt = null;
    if (expectedPosIncrs != null) {
        assertTrue("has PositionIncrementAttribute", t.hasAttribute(PositionIncrementAttribute.class));
        posIncrAtt = t.getAttribute(PositionIncrementAttribute.class);
    }

    TupleAttribute tupleAtt = null;
    if (expectedTupleID != null) {
        assertTrue("has TupleAttribute", t.hasAttribute(TupleAttribute.class));
        tupleAtt = t.getAttribute(TupleAttribute.class);
    }

    CellAttribute cellAtt = null;
    if (expectedCellID != null) {
        assertTrue("has CellAttribute", t.hasAttribute(CellAttribute.class));
        cellAtt = t.getAttribute(CellAttribute.class);
    }

    t.reset(new StringReader(input));
    final TokenStream filter = new URINormalisationFilter(t);
    for (int i = 0; i < expectedImages.length; i++) {
        assertTrue("token " + i + " exists", filter.incrementToken());
        assertEquals(expectedImages[i], termAtt.term());
        if (expectedTypes != null) {
            assertEquals(expectedTypes[i], typeAtt.type());
        }
        if (expectedPosIncrs != null) {
            assertEquals(expectedPosIncrs[i], posIncrAtt.getPositionIncrement());
        }
        if (expectedTupleID != null) {
            assertEquals(expectedTupleID[i], tupleAtt.tuple());
        }
        if (expectedCellID != null) {
            assertEquals(expectedCellID[i], cellAtt.cell());
        }
    }
    assertFalse("end of stream", filter.incrementToken());
    filter.end();
}
From source file:org.sindice.siren.analysis.TestTupleAnalyzer.java
License:Apache License
public void assertAnalyzesTo(final Analyzer a, final String input, final String[] expectedImages,
        final String[] expectedTypes, final int[] expectedPosIncrs, final int[] expectedTupleID,
        final int[] expectedCellID) throws Exception {
    final TokenStream t = a.reusableTokenStream("", new StringReader(input));

    assertTrue("has TermAttribute", t.hasAttribute(TermAttribute.class));
    final TermAttribute termAtt = t.getAttribute(TermAttribute.class);

    TypeAttribute typeAtt = null;
    if (expectedTypes != null) {
        assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class));
        typeAtt = t.getAttribute(TypeAttribute.class);
    }

    PositionIncrementAttribute posIncrAtt = null;
    if (expectedPosIncrs != null) {
        assertTrue("has PositionIncrementAttribute", t.hasAttribute(PositionIncrementAttribute.class));
        posIncrAtt = t.getAttribute(PositionIncrementAttribute.class);
    }

    TupleAttribute tupleAtt = null;
    if (expectedTupleID != null) {
        assertTrue("has TupleAttribute", t.hasAttribute(TupleAttribute.class));
        tupleAtt = t.getAttribute(TupleAttribute.class);
    }

    CellAttribute cellAtt = null;
    if (expectedCellID != null) {
        assertTrue("has CellAttribute", t.hasAttribute(CellAttribute.class));
        cellAtt = t.getAttribute(CellAttribute.class);
    }

    for (int i = 0; i < expectedImages.length; i++) {
        assertTrue("token " + i + " exists", t.incrementToken());
        assertEquals(expectedImages[i], termAtt.term());
        if (expectedTypes != null) {
            assertEquals(expectedTypes[i], typeAtt.type());
        }
        if (expectedPosIncrs != null) {
            assertEquals(expectedPosIncrs[i], posIncrAtt.getPositionIncrement());
        }
        if (expectedTupleID != null) {
            assertEquals(expectedTupleID[i], tupleAtt.tuple());
        }
        if (expectedCellID != null) {
            assertEquals(expectedCellID[i], cellAtt.cell());
        }
    }
    assertFalse("end of stream", t.incrementToken());
    t.end();
    t.close();
}
From source file:org.tallison.lucene.contrast.QueryToCorpusContraster.java
License:Apache License
private void processFieldEntry(String fieldName, String s, CharArraySet set) throws IOException {
    TokenStream ts = analyzer.tokenStream(fieldName, s);
    CharTermAttribute cattr = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        set.add(cattr.toString());
    }
    ts.end();
    ts.close();
}
From source file:org.tallison.lucene.search.concordance.charoffsets.ReanalyzingTokenCharOffsetsReader.java
License:Apache License
private int addFieldValue(String fieldName, int currInd, int charBase, String fieldValue,
        TokenCharOffsetRequests requests, RandomAccessCharOffsetContainer results) throws IOException {
    //Analyzer limitAnalyzer = new LimitTokenCountAnalyzer(baseAnalyzer, 10, true);
    TokenStream stream = baseAnalyzer.tokenStream(fieldName, fieldValue);
    stream.reset();

    int defaultInc = 1;

    CharTermAttribute termAtt = stream
            .getAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class);
    OffsetAttribute offsetAtt = stream
            .getAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute.class);
    PositionIncrementAttribute incAtt = null;
    if (stream.hasAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class)) {
        incAtt = stream
                .getAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class);
    }

    while (stream.incrementToken()) {
        //Do we need this?
        if (incAtt != null && incAtt.getPositionIncrement() == 0) {
            continue;
        }
        currInd += (incAtt != null) ? incAtt.getPositionIncrement() : defaultInc;
        if (requests.contains(currInd)) {
            results.add(currInd, offsetAtt.startOffset() + charBase, offsetAtt.endOffset() + charBase,
                    termAtt.toString());
        }
        if (currInd > requests.getLast()) {
            // TODO: Is there a way to avoid this? Or, is this
            // an imaginary performance hit?
            while (stream.incrementToken()) {
                //NO-OP
            }
            stream.end();
            stream.close();
            return GOT_ALL_REQUESTS;
        }
    }
    stream.end();
    stream.close();
    return currInd;
}
From source file:org.tallison.lucene.search.concordance.charoffsets.SimpleAnalyzerUtil.java
License:Apache License
/**
 * Allows reuse of the terms list; this method calls terms.clear() before adding new
 * terms.
 *
 * @param s string to analyze
 * @param field field to use in analysis
 * @param analyzer analyzer
 * @param terms list for reuse
 * @return list of strings
 * @throws java.io.IOException if there's an IOException during analysis
 */
public static List<String> getTermStrings(String s, String field, Analyzer analyzer, List<String> terms)
        throws IOException {
    if (terms == null) {
        terms = new ArrayList<>();
    }
    terms.clear();
    TokenStream stream = analyzer.tokenStream(field, s);
    stream.reset();
    CharTermAttribute termAtt = stream
            .getAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class);

    while (stream.incrementToken()) {
        terms.add(termAtt.toString());
    }
    stream.end();
    stream.close();
    return terms;
}
From source file:org.tallison.lucene.search.concordance.TestBigramFilter.java
License:Apache License
@Test
public void testBasicNoUnigrams() throws Exception {
    Analyzer analyzer = ConcordanceTestBase.getBigramAnalyzer(MockTokenFilter.EMPTY_STOPSET, 10, 10, false);
    String s = "a b c d e f g";
    TokenStream tokenStream = analyzer.tokenStream(ConcordanceTestBase.FIELD, s);
    tokenStream.reset();
    CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);
    List<String> expected = Arrays.asList(new String[] { "a_b", "b_c", "c_d", "d_e", "e_f", "f_g", });
    List<String> returned = new ArrayList<>();
    while (tokenStream.incrementToken()) {
        String token = charTermAttribute.toString();
        assertEquals(1, posIncAttribute.getPositionIncrement());
        returned.add(token);
    }
    tokenStream.end();
    tokenStream.close();
    assertEquals(expected, returned);
}