List of usage examples for org.apache.lucene.analysis.TokenStream#hasAttribute
public final boolean hasAttribute(Class<? extends Attribute> attClass)
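A minimal, self-contained sketch of the call pattern the examples below share. It is not taken from any of the source files; it assumes a recent Lucene where StandardAnalyzer has a no-argument constructor, and the field name and input text are placeholders. hasAttribute only reports whether an attribute is present on the stream; getAttribute then retrieves it, and would throw IllegalArgumentException if the attribute were absent, which is what the check guards against.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class HasAttributeSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream stream = analyzer.tokenStream("field", "Some Text To Tokenize")) {
            // Probe before reading: getAttribute() throws IllegalArgumentException
            // if the attribute is absent, so guard it with hasAttribute().
            if (stream.hasAttribute(CharTermAttribute.class)) {
                CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
                stream.reset();
                while (stream.incrementToken()) {
                    System.out.println(termAtt.toString());
                }
                stream.end();
            }
        }
        analyzer.close();
    }
}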
From source file:org.sc.probro.lucene.BiothesaurusSearcher.java
License:Apache License
public Query createPhraseQuery(String field, String phrase) throws IOException {
    PhraseQuery query = new PhraseQuery();
    /*
    String[] array = phrase.split("\\s+");
    for(int i = 0; i < array.length; i++) {
        query.add(new Term(field, array[i]));
    }
    */
    try {
        TokenStream stream = analyzer.tokenStream(field, new StringReader(phrase));
        //stream = new LowerCaseFilter(stream);
        stream.reset();
        while (stream.incrementToken()) {
            if (stream.hasAttribute(TermAttribute.class)) {
                TermAttribute termattr = (TermAttribute) stream.getAttribute(TermAttribute.class);
                Term t = new Term(field, termattr.term());
                query.add(t);
            }
        }
        stream.end();
        stream.close();
    } catch (IllegalArgumentException e) {
        e.printStackTrace(System.err);
        System.err.println(String.format("Phrase: \"%s\"", phrase));
    }
    return query;
}
From source file:org.sc.probro.lucene.ProteinSearcher.java
License:Apache License
public String[] tokenize(String input) {
    ArrayList<String> tokens = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(input));
        stream = new LowerCaseFilter(stream);
        stream.reset();
        while (stream.incrementToken()) {
            if (stream.hasAttribute(TermAttribute.class)) {
                TermAttribute termattr = (TermAttribute) stream.getAttribute(TermAttribute.class);
                String term = termattr.term();
                tokens.add(term);
            }
        }
        stream.end();
        stream.close();
    } catch (IllegalArgumentException e) {
        System.err.println(String.format("Phrase: \"%s\"", input));
        e.printStackTrace(System.err);
    } catch (IOException e) {
        System.err.println(String.format("Phrase: \"%s\"", input));
        e.printStackTrace();
    }
    return tokens.toArray(new String[0]);
}
From source file:org.sc.probro.lucene.ProteinSearcher.java
License:Apache License
public Query createPhraseQuery(String field, String phrase) throws IOException {
    PhraseQuery query = new PhraseQuery();
    /*
    String[] array = phrase.split("\\s+");
    for(int i = 0; i < array.length; i++) {
        query.add(new Term(field, array[i]));
    }
    */
    try {
        TokenStream stream = analyzer.tokenStream(field, new StringReader(phrase));
        stream = new LowerCaseFilter(stream);
        stream.reset();
        while (stream.incrementToken()) {
            if (stream.hasAttribute(TermAttribute.class)) {
                TermAttribute termattr = (TermAttribute) stream.getAttribute(TermAttribute.class);
                Term t = new Term(field, termattr.term());
                query.add(t);
            }
        }
        stream.end();
        stream.close();
    } catch (IllegalArgumentException e) {
        e.printStackTrace(System.err);
        System.err.println(String.format("Phrase: \"%s\"", phrase));
    }
    return query;
}
From source file:org.sd.text.lucene.LuceneUtils.java
License:Open Source License
/**
 * Split the string into tokens using the given analyzer.
 */
public static final List<String> getTokenTexts(Analyzer analyzer, String fieldName, String string) {
    if (string == null)
        return null;

    final List<String> result = new ArrayList<String>();

    if (analyzer != null) {
        final TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(string));
        try {
            while (tokenStream.incrementToken()) {
                if (tokenStream.hasAttribute(TermAttribute.class)) {
                    final TermAttribute termAttribute = (TermAttribute) tokenStream
                            .getAttribute(TermAttribute.class);
                    result.add(termAttribute.term());
                }
            }
            tokenStream.close();
        } catch (IOException e) {
            throw new IllegalStateException(e);
        }
    } else {
        result.add(string);
    }

    return result;
}
From source file:org.sd.text.lucene.LuceneUtils.java
License:Open Source License
public static final List<List<String>> getPhraseTexts(Analyzer analyzer, String fieldName, String string) {
    if (string == null)
        return null;

    final List<List<String>> result = new LinkedList<List<String>>();
    List<String> curPhrase = new ArrayList<String>();
    result.add(curPhrase);

    if (analyzer != null) {
        final TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(string));
        int lastEndOffset = 0;
        try {
            while (tokenStream.incrementToken()) {
                boolean incPhrase = true;
                if (tokenStream.hasAttribute(OffsetAttribute.class)) {
                    final OffsetAttribute offsetAttribute = (OffsetAttribute) tokenStream
                            .getAttribute(OffsetAttribute.class);
                    if (offsetAttribute.startOffset() == lastEndOffset) {
                        incPhrase = false;
                    }
                    lastEndOffset = offsetAttribute.endOffset();
                }

                if (tokenStream.hasAttribute(TermAttribute.class)) {
                    final TermAttribute termAttribute = (TermAttribute) tokenStream
                            .getAttribute(TermAttribute.class);
                    if (incPhrase && curPhrase.size() > 0) {
                        curPhrase = new ArrayList<String>();
                        result.add(curPhrase);
                    }
                    curPhrase.add(termAttribute.term());
                }
            }
            tokenStream.close();
        } catch (IOException e) {
            throw new IllegalStateException(e);
        }
    } else {
        curPhrase.add(string);
    }

    return result;
}
From source file:org.sd.text.lucene.LuceneUtils.java
License:Open Source License
/**
 * Build a phrase query from the tokens in the given string using the given
 * analyzer.
 * <p>
 * Use a BooleanClause.Occur.MUST for exact matches and BooleanClause.Occur.SHOULD
 * for fuzzy matches.
 */
public static final Query toQuery(Analyzer analyzer, String fieldName, String string,
        Collection<String> termCollector, BooleanClause.Occur occur) {
    Query result = null;

    if (analyzer != null) {
        final TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(string));
        BooleanQuery booleanQuery = null;
        PhraseQuery phraseQuery = null;
        int lastEndOffset = 0;

        try {
            while (tokenStream.incrementToken()) {
                if (tokenStream.hasAttribute(TermAttribute.class)) {
                    final TermAttribute termAttribute = (TermAttribute) tokenStream
                            .getAttribute(TermAttribute.class);
                    final String term = termAttribute.term();

                    // check offset attribute
                    if (tokenStream.hasAttribute(OffsetAttribute.class)) {
                        final OffsetAttribute offsetAttribute = (OffsetAttribute) tokenStream
                                .getAttribute(OffsetAttribute.class);
                        if (offsetAttribute.startOffset() != lastEndOffset) {
                            // time to increment phrase
                            if (phraseQuery != null) {
                                if (booleanQuery == null)
                                    booleanQuery = new BooleanQuery();
                                booleanQuery.add(phraseQuery, occur);
                                phraseQuery = null;
                            }
                        }
                        lastEndOffset = offsetAttribute.endOffset();
                    }

                    if (phraseQuery == null)
                        phraseQuery = new PhraseQuery();
                    phraseQuery.add(new Term(fieldName, term));
                    if (termCollector != null)
                        termCollector.add(term);
                }
            }
        } catch (IOException e) {
            throw new IllegalStateException(e);
        }

        if (phraseQuery != null) {
            if (booleanQuery == null)
                booleanQuery = new BooleanQuery();
            booleanQuery.add(phraseQuery, BooleanClause.Occur.SHOULD);
        }

        result = booleanQuery;
    }

    if (result == null) {
        result = new TermQuery(new Term(fieldName, string));
        if (termCollector != null)
            termCollector.add(string);
    }

    return result;
}
From source file:org.sindice.siren.analysis.TestTupleAnalyzer.java
License:Apache License
public void assertAnalyzesTo(final Analyzer a, final String input, final String[] expectedImages,
        final String[] expectedTypes, final int[] expectedPosIncrs, final int[] expectedTupleID,
        final int[] expectedCellID) throws Exception {
    final TokenStream t = a.reusableTokenStream("", new StringReader(input));

    assertTrue("has TermAttribute", t.hasAttribute(TermAttribute.class));
    final TermAttribute termAtt = t.getAttribute(TermAttribute.class);

    TypeAttribute typeAtt = null;
    if (expectedTypes != null) {
        assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class));
        typeAtt = t.getAttribute(TypeAttribute.class);
    }

    PositionIncrementAttribute posIncrAtt = null;
    if (expectedPosIncrs != null) {
        assertTrue("has PositionIncrementAttribute", t.hasAttribute(PositionIncrementAttribute.class));
        posIncrAtt = t.getAttribute(PositionIncrementAttribute.class);
    }

    TupleAttribute tupleAtt = null;
    if (expectedTupleID != null) {
        assertTrue("has TupleAttribute", t.hasAttribute(TupleAttribute.class));
        tupleAtt = t.getAttribute(TupleAttribute.class);
    }

    CellAttribute cellAtt = null;
    if (expectedCellID != null) {
        assertTrue("has CellAttribute", t.hasAttribute(CellAttribute.class));
        cellAtt = t.getAttribute(CellAttribute.class);
    }

    for (int i = 0; i < expectedImages.length; i++) {
        assertTrue("token " + i + " exists", t.incrementToken());
        assertEquals(expectedImages[i], termAtt.term());
        if (expectedTypes != null) {
            assertEquals(expectedTypes[i], typeAtt.type());
        }
        if (expectedPosIncrs != null) {
            assertEquals(expectedPosIncrs[i], posIncrAtt.getPositionIncrement());
        }
        if (expectedTupleID != null) {
            assertEquals(expectedTupleID[i], tupleAtt.tuple());
        }
        if (expectedCellID != null) {
            assertEquals(expectedCellID[i], cellAtt.cell());
        }
    }

    assertFalse("end of stream", t.incrementToken());
    t.end();
    t.close();
}
From source file:org.tallison.lucene.search.concordance.charoffsets.ReanalyzingTokenCharOffsetsReader.java
License:Apache License
private int addFieldValue(String fieldName, int currInd, int charBase, String fieldValue,
        TokenCharOffsetRequests requests, RandomAccessCharOffsetContainer results) throws IOException {
    //Analyzer limitAnalyzer = new LimitTokenCountAnalyzer(baseAnalyzer, 10, true);
    TokenStream stream = baseAnalyzer.tokenStream(fieldName, fieldValue);
    stream.reset();

    int defaultInc = 1;

    CharTermAttribute termAtt = stream
            .getAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class);
    OffsetAttribute offsetAtt = stream
            .getAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute.class);
    PositionIncrementAttribute incAtt = null;
    if (stream.hasAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class)) {
        incAtt = stream
                .getAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class);
    }

    while (stream.incrementToken()) {
        //Do we need this?
        if (incAtt != null && incAtt.getPositionIncrement() == 0) {
            continue;
        }

        currInd += (incAtt != null) ? incAtt.getPositionIncrement() : defaultInc;
        if (requests.contains(currInd)) {
            results.add(currInd, offsetAtt.startOffset() + charBase, offsetAtt.endOffset() + charBase,
                    termAtt.toString());
        }
        if (currInd > requests.getLast()) {
            // TODO: Is there a way to avoid this? Or, is this
            // an imaginary performance hit?
            while (stream.incrementToken()) {
                //NO-OP
            }
            stream.end();
            stream.close();
            return GOT_ALL_REQUESTS;
        }
    }
    stream.end();
    stream.close();
    return currInd;
}
From source file:perf.TestAnalyzerPerf.java
License:Apache License
private static void testAnalyzer(String desc, File wikiLinesFile, Analyzer a, int warmupCount, int runCount)
        throws Exception {
    System.out.println("\nTEST: " + desc);

    // 64 KB buffer
    InputStream is = new FileInputStream(wikiLinesFile);
    BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"), 1 << 16);

    long startTime = System.currentTimeMillis();
    long sumTime = 0;
    long hash = 0;
    long tokenCount = 0;
    int totCount = warmupCount + runCount;
    for (int i = 0; i < totCount; i++) {
        boolean isWarmup = i < warmupCount;
        if (i % 10000 == 0) {
            System.out.println(String.format(Locale.ROOT, "%.1f sec: %d...",
                    (System.currentTimeMillis() - startTime) / 1000.0, i));
        }
        String s = reader.readLine();
        long t0 = System.nanoTime();
        TokenStream ts = a.tokenStream("field", new StringReader(s));
        ts.reset();

        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncAtt;
        if (ts.hasAttribute(PositionIncrementAttribute.class)) {
            posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
        } else {
            posIncAtt = null;
        }
        OffsetAttribute offsetAtt;
        if (ts.hasAttribute(OffsetAttribute.class)) {
            offsetAtt = ts.getAttribute(OffsetAttribute.class);
        } else {
            offsetAtt = null;
        }

        while (ts.incrementToken()) {
            hash += 31 * ArrayUtil.hashCode(termAtt.buffer(), 0, termAtt.length());
            if (posIncAtt != null) {
                hash += 31 * posIncAtt.getPositionIncrement();
            }
            if (offsetAtt != null) {
                hash += 31 * offsetAtt.startOffset();
                hash += 31 * offsetAtt.endOffset();
            }
            if (isWarmup == false) {
                tokenCount++;
            }
        }
        ts.end();
        ts.close();

        if (isWarmup == false) {
            sumTime += System.nanoTime() - t0;
        }
    }
    reader.close();

    System.out.println(String.format(Locale.ROOT, "%s time=%.2f msec hash=%d tokens=%d", desc,
            (sumTime / 1000000.0), hash, tokenCount));
}
From source file:uk.gov.nationalarchives.discovery.taxonomy.common.repository.lucene.analyzer.TaxonomyGeneralAnalyzerTest.java
License:Mozilla Public License
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[],
        int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset,
        Integer finalPosInc, boolean[] keywordAtts, boolean offsetsAreCorrect) throws IOException {
    assertNotNull(output);
    CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);

    CharTermAttribute termAtt = null;
    if (output.length > 0) {
        assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
        termAtt = ts.getAttribute(CharTermAttribute.class);
    }

    OffsetAttribute offsetAtt = null;
    if (startOffsets != null || endOffsets != null || finalOffset != null) {
        assertTrue("has no OffsetAttribute", ts.hasAttribute(OffsetAttribute.class));
        offsetAtt = ts.getAttribute(OffsetAttribute.class);
    }

    TypeAttribute typeAtt = null;
    if (types != null) {
        assertTrue("has no TypeAttribute", ts.hasAttribute(TypeAttribute.class));
        typeAtt = ts.getAttribute(TypeAttribute.class);
    }

    PositionIncrementAttribute posIncrAtt = null;
    if (posIncrements != null || finalPosInc != null) {
        assertTrue("has no PositionIncrementAttribute", ts.hasAttribute(PositionIncrementAttribute.class));
        posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
    }

    PositionLengthAttribute posLengthAtt = null;
    if (posLengths != null) {
        assertTrue("has no PositionLengthAttribute", ts.hasAttribute(PositionLengthAttribute.class));
        posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
    }

    KeywordAttribute keywordAtt = null;
    if (keywordAtts != null) {
        assertTrue("has no KeywordAttribute", ts.hasAttribute(KeywordAttribute.class));
        keywordAtt = ts.getAttribute(KeywordAttribute.class);
    }

    // Maps position to the start/end offset:
    final Map<Integer, Integer> posToStartOffset = new HashMap<>();
    final Map<Integer, Integer> posToEndOffset = new HashMap<>();

    ts.reset();
    int pos = -1;
    int lastStartOffset = 0;
    for (int i = 0; i < output.length; i++) {
        // extra safety to enforce, that the state is not preserved and also
        // assign bogus values
        ts.clearAttributes();
        termAtt.setEmpty().append("bogusTerm");
        if (offsetAtt != null)
            offsetAtt.setOffset(14584724, 24683243);
        if (typeAtt != null)
            typeAtt.setType("bogusType");
        if (posIncrAtt != null)
            posIncrAtt.setPositionIncrement(45987657);
        if (posLengthAtt != null)
            posLengthAtt.setPositionLength(45987653);
        if (keywordAtt != null)
            keywordAtt.setKeyword((i & 1) == 0);

        checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before
        assertTrue("token " + i + " does not exist", ts.incrementToken());
        assertTrue("clearAttributes() was not called correctly in TokenStream chain",
                checkClearAtt.getAndResetClearCalled());

        assertEquals("term " + i, output[i], termAtt.toString());
        if (startOffsets != null) {
            assertEquals("startOffset " + i, startOffsets[i], offsetAtt.startOffset());
        }
        if (endOffsets != null) {
            assertEquals("endOffset " + i, endOffsets[i], offsetAtt.endOffset());
        }
        if (types != null) {
            assertEquals("type " + i, types[i], typeAtt.type());
        }
        if (posIncrements != null) {
            assertEquals("posIncrement " + i, posIncrements[i], posIncrAtt.getPositionIncrement());
        }
        if (posLengths != null) {
            assertEquals("posLength " + i, posLengths[i], posLengthAtt.getPositionLength());
        }
        if (keywordAtts != null) {
            assertEquals("keywordAtt " + i, keywordAtts[i], keywordAtt.isKeyword());
        }

        // we can enforce some basic things about a few attributes even if
        // the caller doesn't check:
        if (offsetAtt != null) {
            final int startOffset = offsetAtt.startOffset();
            final int endOffset = offsetAtt.endOffset();
            if (finalOffset != null) {
                assertTrue("startOffset must be <= finalOffset", startOffset <= finalOffset.intValue());
                assertTrue("endOffset must be <= finalOffset: got endOffset=" + endOffset
                        + " vs finalOffset=" + finalOffset.intValue(), endOffset <= finalOffset.intValue());
            }

            if (offsetsAreCorrect) {
                assertTrue("offsets must not go backwards startOffset=" + startOffset
                        + " is < lastStartOffset=" + lastStartOffset,
                        offsetAtt.startOffset() >= lastStartOffset);
                lastStartOffset = offsetAtt.startOffset();
            }

            if (offsetsAreCorrect && posLengthAtt != null && posIncrAtt != null) {
                // Validate offset consistency in the graph, ie
                // all tokens leaving from a certain pos have the
                // same startOffset, and all tokens arriving to a
                // certain pos have the same endOffset:
                final int posInc = posIncrAtt.getPositionIncrement();
                pos += posInc;

                final int posLength = posLengthAtt.getPositionLength();

                if (!posToStartOffset.containsKey(pos)) {
                    // First time we've seen a token leaving from this position:
                    posToStartOffset.put(pos, startOffset);
                    // System.out.println("  + s " + pos + " -> " + startOffset);
                } else {
                    // We've seen a token leaving from this position before;
                    // verify the startOffset is the same:
                    // System.out.println("  + vs " + pos + " -> " + startOffset);
                    assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt,
                            posToStartOffset.get(pos).intValue(), startOffset);
                }

                final int endPos = pos + posLength;

                if (!posToEndOffset.containsKey(endPos)) {
                    // First time we've seen a token arriving to this position:
                    posToEndOffset.put(endPos, endOffset);
                    // System.out.println("  + e " + endPos + " -> " + endOffset);
                } else {
                    // We've seen a token arriving to this position before;
                    // verify the endOffset is the same:
                    // System.out.println("  + ve " + endPos + " -> " + endOffset);
                    assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt,
                            posToEndOffset.get(endPos).intValue(), endOffset);
                }
            }
        }
        if (posIncrAtt != null) {
            if (i == 0) {
                assertTrue("first posIncrement must be >= 1", posIncrAtt.getPositionIncrement() >= 1);
            } else {
                assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
            }
        }
        if (posLengthAtt != null) {
            assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
        }
    }

    if (ts.incrementToken()) {
        fail("TokenStream has more tokens than expected (expected count=" + output.length
                + "); extra token=" + termAtt.toString());
    }

    // repeat our extra safety checks for end()
    ts.clearAttributes();
    if (termAtt != null)
        termAtt.setEmpty().append("bogusTerm");
    if (offsetAtt != null)
        offsetAtt.setOffset(14584724, 24683243);
    if (typeAtt != null)
        typeAtt.setType("bogusType");
    if (posIncrAtt != null)
        posIncrAtt.setPositionIncrement(45987657);
    if (posLengthAtt != null)
        posLengthAtt.setPositionLength(45987653);

    checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before

    ts.end();
    assertTrue("super.end()/clearAttributes() was not called correctly in end()",
            checkClearAtt.getAndResetClearCalled());

    if (finalOffset != null) {
        assertEquals("finalOffset", finalOffset.intValue(), offsetAtt.endOffset());
    }
    if (offsetAtt != null) {
        assertTrue("finalOffset must be >= 0", offsetAtt.endOffset() >= 0);
    }
    if (finalPosInc != null) {
        assertEquals("finalPosInc", finalPosInc.intValue(), posIncrAtt.getPositionIncrement());
    }
    ts.close();
}