List of usage examples for org.apache.lucene.analysis.TokenStream.reset()
public void reset() throws IOException
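All of the examples below follow the same consumption contract: call reset() once before the first incrementToken(), drain the stream, then call end() and close(). A minimal, self-contained sketch of that workflow follows; the analyzer choice, the field name "body", and the sample text are illustrative and not taken from the examples on this page.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamResetExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // tokenStream() may hand back a reused instance; reset() puts it into a consumable state
        try (TokenStream stream = analyzer.tokenStream("body", "a quick usage example")) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                  // required before the first incrementToken()
            while (stream.incrementToken()) {
                System.out.println(term.toString());
            }
            stream.end();                    // records the final offset/position state
        }                                    // try-with-resources closes the stream
    }
}

On recent Lucene versions, skipping reset() typically fails with a "TokenStream contract violation" IllegalStateException, which is why every example below resets the stream before iterating.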
From source file: uib.scratch.AnalyzerUtils.java

public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException {
    // obtain the token stream for the text
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));

    // register the attributes to inspect
    TermAttribute term = stream.addAttribute(TermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);

    int position = 0;
    stream.reset();
    while (stream.incrementToken()) {
        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            // only advance and print the position when the increment is non-zero
            position = position + increment;
            System.out.println();
            System.out.print(position + ": ");
        }
        // print term text, offsets and token type
        System.out.print("[" +
                term.term() + ":" +
                offset.startOffset() + "->" +
                offset.endOffset() + ":" +
                type.type() + "] ");
    }
    System.out.println();
}
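This example targets the Lucene 2.9/3.x attribute API, in which TermAttribute.term() returns the token text; later releases removed TermAttribute in favour of CharTermAttribute. The sketch below shows a simplified version of a similar dump against the newer API; the method name displayTokens is illustrative and the usual org.apache.lucene.analysis.tokenattributes imports are assumed.

public static void displayTokens(Analyzer analyzer, String text) throws IOException {
    // Sketch only: CharTermAttribute replaces the deprecated TermAttribute; the reset()/incrementToken() loop is unchanged.
    try (TokenStream stream = analyzer.tokenStream("contents", new StringReader(text))) {
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
        TypeAttribute type = stream.addAttribute(TypeAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.print("[" + term + ":" + offset.startOffset() + "->"
                    + offset.endOffset() + ":" + type.type() + "] ");
        }
        stream.end();
        System.out.println();
    }
}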
From source file: uk.co.flax.luwak.analysis.TestSuffixingNGramTokenizer.java
License: Apache License

public static void main(String... args) throws IOException {
    String text = Files.toString(new File("src/test/resources/gutenberg/README"), Charsets.UTF_8);
    DocumentBatch batch = DocumentBatch
            .of(InputDocument.builder("1").addField("f", text, new StandardAnalyzer()).build());
    for (int i = 0; i < 50; i++) {
        long time = System.currentTimeMillis();
        // Cannot use try-with-resources here as we assign to ts in the block.
        LeafReader reader = batch.getIndexReader();
        TokenStream ts = new TermsEnumTokenStream(reader.fields().terms("f").iterator());
        try {
            ts = new SuffixingNGramTokenFilter(ts, "XX", "__WILDCARD__", 20);
            //ts = new DuplicateRemovalTokenFilter(ts);
            int tokencount = 0;
            ts.reset();
            while (ts.incrementToken()) {
                tokencount++;
            }
            System.out.println(tokencount + " tokens in " + (System.currentTimeMillis() - time) + " ms");
        } finally {
            ts.close();
        }
    }
}
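The comment above notes that try-with-resources is ruled out by the reassignment of ts. One possible alternative, sketched here, is to build the whole filter chain in one expression before entering the block, since closing the outermost filter also closes its input. This is a hedged variant, not part of the original test.

// Sketch: build the chain up front so the outermost filter can be the try-with-resources resource.
try (TokenStream ts = new SuffixingNGramTokenFilter(
        new TermsEnumTokenStream(reader.fields().terms("f").iterator()),
        "XX", "__WILDCARD__", 20)) {
    int tokencount = 0;
    ts.reset();
    while (ts.incrementToken()) {
        tokencount++;
    }
    System.out.println(tokencount + " tokens");
}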
From source file: uk.co.flax.luwak.assertions.TokenStreamAssert.java
License: Apache License

protected TokenStreamAssert(TokenStream actual) throws IOException {
    super(actual, TokenStreamAssert.class);
    termAtt = actual.addAttribute(CharTermAttribute.class);
    posIncAtt = actual.addAttribute(PositionIncrementAttribute.class);
    offsetAtt = actual.addAttribute(OffsetAttribute.class);
    actual.reset();
    actualpos = 0;
}
From source file: uk.co.flax.luwak.testutils.TokenStreamUtils.java
License: Apache License

public static void dumpTokenStream(TokenStream ts) throws IOException {
    ts.reset();
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    while (ts.incrementToken()) {
        System.out.println(termAtt.toString());
    }
}
From source file: uk.co.nickthecoder.pinkwino.metadata.LuceneMetaData.java
License: Open Source License

public String analyzeWord(String word) {
    Reader reader = new StringReader(word);
    TokenStream tokenStream = null;
    try {
        tokenStream = _analyzer.tokenStream("content", reader);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            // return the first term produced by the analyzer
            return charTermAttribute.toString();
        }
    } catch (Exception e) {
        _logger.error("Failed to filter a keyword. " + e);
    } finally {
        try {
            if (tokenStream != null) {
                tokenStream.end();
                tokenStream.close();
            }
            reader.close();
        } catch (Exception e) {
            _logger.error("Failed to close during analyzeWord " + e);
        }
    }
    return null;
}
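Since TokenStream and Reader are both Closeable, the explicit null check and nested try/finally above can be collapsed with try-with-resources on Java 7+. The following is a sketch of an equivalent version under that assumption; _analyzer and _logger are the same fields used in the example, and the behaviour (return the first emitted term, or null) is intended to be unchanged.

public String analyzeWord(String word) {
    // Sketch only: try-with-resources closes the stream and the reader on all paths.
    try (Reader reader = new StringReader(word);
            TokenStream tokenStream = _analyzer.tokenStream("content", reader)) {
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        if (tokenStream.incrementToken()) {
            String term = charTermAttribute.toString();
            tokenStream.end();
            return term;
        }
        tokenStream.end();
    } catch (Exception e) {
        _logger.error("Failed to filter a keyword. " + e);
    }
    return null;
}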
From source file: uk.gov.nationalarchives.discovery.taxonomy.common.repository.lucene.analyzer.TaxonomyGeneralAnalyzerTest.java
License: Mozilla Public License

public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[],
        int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset,
        Integer finalPosInc, boolean[] keywordAtts, boolean offsetsAreCorrect) throws IOException {
    assertNotNull(output);
    CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);

    CharTermAttribute termAtt = null;
    if (output.length > 0) {
        assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
        termAtt = ts.getAttribute(CharTermAttribute.class);
    }

    OffsetAttribute offsetAtt = null;
    if (startOffsets != null || endOffsets != null || finalOffset != null) {
        assertTrue("has no OffsetAttribute", ts.hasAttribute(OffsetAttribute.class));
        offsetAtt = ts.getAttribute(OffsetAttribute.class);
    }

    TypeAttribute typeAtt = null;
    if (types != null) {
        assertTrue("has no TypeAttribute", ts.hasAttribute(TypeAttribute.class));
        typeAtt = ts.getAttribute(TypeAttribute.class);
    }

    PositionIncrementAttribute posIncrAtt = null;
    if (posIncrements != null || finalPosInc != null) {
        assertTrue("has no PositionIncrementAttribute", ts.hasAttribute(PositionIncrementAttribute.class));
        posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
    }

    PositionLengthAttribute posLengthAtt = null;
    if (posLengths != null) {
        assertTrue("has no PositionLengthAttribute", ts.hasAttribute(PositionLengthAttribute.class));
        posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
    }

    KeywordAttribute keywordAtt = null;
    if (keywordAtts != null) {
        assertTrue("has no KeywordAttribute", ts.hasAttribute(KeywordAttribute.class));
        keywordAtt = ts.getAttribute(KeywordAttribute.class);
    }

    // Maps position to the start/end offset:
    final Map<Integer, Integer> posToStartOffset = new HashMap<>();
    final Map<Integer, Integer> posToEndOffset = new HashMap<>();

    ts.reset();
    int pos = -1;
    int lastStartOffset = 0;
    for (int i = 0; i < output.length; i++) {
        // extra safety to enforce, that the state is not preserved and also assign bogus values
        ts.clearAttributes();
        termAtt.setEmpty().append("bogusTerm");
        if (offsetAtt != null)
            offsetAtt.setOffset(14584724, 24683243);
        if (typeAtt != null)
            typeAtt.setType("bogusType");
        if (posIncrAtt != null)
            posIncrAtt.setPositionIncrement(45987657);
        if (posLengthAtt != null)
            posLengthAtt.setPositionLength(45987653);
        if (keywordAtt != null)
            keywordAtt.setKeyword((i & 1) == 0);

        checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before
        assertTrue("token " + i + " does not exist", ts.incrementToken());
        assertTrue("clearAttributes() was not called correctly in TokenStream chain",
                checkClearAtt.getAndResetClearCalled());

        assertEquals("term " + i, output[i], termAtt.toString());
        if (startOffsets != null) {
            assertEquals("startOffset " + i, startOffsets[i], offsetAtt.startOffset());
        }
        if (endOffsets != null) {
            assertEquals("endOffset " + i, endOffsets[i], offsetAtt.endOffset());
        }
        if (types != null) {
            assertEquals("type " + i, types[i], typeAtt.type());
        }
        if (posIncrements != null) {
            assertEquals("posIncrement " + i, posIncrements[i], posIncrAtt.getPositionIncrement());
        }
        if (posLengths != null) {
            assertEquals("posLength " + i, posLengths[i], posLengthAtt.getPositionLength());
        }
        if (keywordAtts != null) {
            assertEquals("keywordAtt " + i, keywordAtts[i], keywordAtt.isKeyword());
        }

        // we can enforce some basic things about a few attributes even if the caller doesn't check:
        if (offsetAtt != null) {
            final int startOffset = offsetAtt.startOffset();
            final int endOffset = offsetAtt.endOffset();
            if (finalOffset != null) {
                assertTrue("startOffset must be <= finalOffset", startOffset <= finalOffset.intValue());
                assertTrue("endOffset must be <= finalOffset: got endOffset=" + endOffset + " vs finalOffset="
                        + finalOffset.intValue(), endOffset <= finalOffset.intValue());
            }

            if (offsetsAreCorrect) {
                assertTrue("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset="
                        + lastStartOffset, offsetAtt.startOffset() >= lastStartOffset);
                lastStartOffset = offsetAtt.startOffset();
            }

            if (offsetsAreCorrect && posLengthAtt != null && posIncrAtt != null) {
                // Validate offset consistency in the graph, ie all tokens leaving from a certain pos
                // have the same startOffset, and all tokens arriving to a certain pos have the same endOffset:
                final int posInc = posIncrAtt.getPositionIncrement();
                pos += posInc;

                final int posLength = posLengthAtt.getPositionLength();

                if (!posToStartOffset.containsKey(pos)) {
                    // First time we've seen a token leaving from this position:
                    posToStartOffset.put(pos, startOffset);
                    // System.out.println("  + s " + pos + " -> " + startOffset);
                } else {
                    // We've seen a token leaving from this position before; verify the startOffset is the same:
                    // System.out.println("  + vs " + pos + " -> " + startOffset);
                    assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt,
                            posToStartOffset.get(pos).intValue(), startOffset);
                }

                final int endPos = pos + posLength;

                if (!posToEndOffset.containsKey(endPos)) {
                    // First time we've seen a token arriving to this position:
                    posToEndOffset.put(endPos, endOffset);
                    // System.out.println("  + e " + endPos + " -> " + endOffset);
                } else {
                    // We've seen a token arriving to this position before; verify the endOffset is the same:
                    // System.out.println("  + ve " + endPos + " -> " + endOffset);
                    assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt,
                            posToEndOffset.get(endPos).intValue(), endOffset);
                }
            }
        }
        if (posIncrAtt != null) {
            if (i == 0) {
                assertTrue("first posIncrement must be >= 1", posIncrAtt.getPositionIncrement() >= 1);
            } else {
                assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
            }
        }
        if (posLengthAtt != null) {
            assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
        }
    }

    if (ts.incrementToken()) {
        fail("TokenStream has more tokens than expected (expected count=" + output.length + "); extra token="
                + termAtt.toString());
    }

    // repeat our extra safety checks for end()
    ts.clearAttributes();
    if (termAtt != null)
        termAtt.setEmpty().append("bogusTerm");
    if (offsetAtt != null)
        offsetAtt.setOffset(14584724, 24683243);
    if (typeAtt != null)
        typeAtt.setType("bogusType");
    if (posIncrAtt != null)
        posIncrAtt.setPositionIncrement(45987657);
    if (posLengthAtt != null)
        posLengthAtt.setPositionLength(45987653);

    checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before

    ts.end();
    assertTrue("super.end()/clearAttributes() was not called correctly in end()",
            checkClearAtt.getAndResetClearCalled());

    if (finalOffset != null) {
        assertEquals("finalOffset", finalOffset.intValue(), offsetAtt.endOffset());
    }
    if (offsetAtt != null) {
        assertTrue("finalOffset must be >= 0", offsetAtt.endOffset() >= 0);
    }
    if (finalPosInc != null) {
        assertEquals("finalPosInc", finalPosInc.intValue(), posIncrAtt.getPositionIncrement());
    }

    ts.close();
}
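A hypothetical invocation, to show how the long parameter list maps onto the checks; the terms and increments are made up, and passing null disables the corresponding assertions.

// Illustrative only: verify two terms and their position increments, skip offset/type/length/keyword checks.
assertTokenStreamContents(ts,
        new String[] { "quick", "fox" },   // expected terms
        null, null,                        // start/end offsets not checked
        null,                              // types not checked
        new int[] { 1, 2 },                // position increments (a gap before "fox")
        null,                              // position lengths not checked
        null, null,                        // final offset / final position increment not checked
        null,                              // keyword flags not checked
        true);                             // offsets, where present, must be consistent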
From source file: varaha.text.TokenizeText.java
License: Apache License

/** Fills a DataBag with tokens from a TokenStream */
public DataBag fillBag(TokenStream stream) throws IOException {
    DataBag result = bagFactory.newDefaultBag();
    CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
    try {
        stream.reset();
        while (stream.incrementToken()) {
            if (termAttribute.length() > 0) {
                Tuple termText = tupleFactory.newTuple(termAttribute.toString());
                result.add(termText);
            }
        }
        stream.end();
    } finally {
        stream.close();
    }
    return result;
}
From source file: webdocs.WebDocAnalyzer.java

String preprocessText(String html, boolean title) throws IOException {
    int freqCutoffThreshold = title ? 1 : this.freqCutoffThreshold;
    HashMap<String, Integer> tfMap = new HashMap<>();
    StringBuffer buff = new StringBuffer();

    CharArraySet stopList = StopFilter.makeStopSet(Version.LUCENE_4_9, indexer.buildStopwordList("stopfile"));
    Analyzer webdocAnalyzer = new WebDocAnalyzer(indexer.getProperties(), stopList);

    TokenStream stream = webdocAnalyzer.tokenStream("field", new StringReader(html));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();

    while (stream.incrementToken()) {
        String token = termAtt.toString();
        Integer tf = tfMap.get(token);
        if (tf == null) {
            tf = new Integer(0);
        }
        tf++;
        tfMap.put(token, tf);
    }
    stream.end();
    stream.close();

    for (Map.Entry<String, Integer> e : tfMap.entrySet()) {
        String word = e.getKey();
        int tf = e.getValue();
        if (tf >= freqCutoffThreshold) {
            for (int i = 0; i < tf; i++) {
                // append this word tf times... word order doesn't matter!
                buff.append(word).append(" ");
            }
        }
    }
    return buff.toString();
}
From source file: workTextIndexService.Procesamiento.java

public String normalizar(String texto) {
    resultN = "";
    @SuppressWarnings("deprecation")
    SpanishAnalyzer analyzer = new SpanishAnalyzer(Version.LUCENE_4_10_1);
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(texto));
        stream.reset();
        while (stream.incrementToken()) {
            resultN = resultN + (stream.getAttribute(CharTermAttribute.class).toString()) + " ";
        }
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return resultN.toLowerCase();
}
From source file: ws.project.languagebasedlexiconanalisys.TwitterStreamAnalizer.java

public static List<String> tokenizeString(Analyzer analyzer, String string) {
    List<String> result = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
    } catch (IOException e) {
        // not thrown because we're using a string reader...
        throw new RuntimeException(e);
    }
    return result;
}
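A hypothetical call site for this helper; the analyzer choice and input string are illustrative.

// Illustrative usage: any Analyzer works, StandardAnalyzer is just an example.
List<String> tokens = tokenizeString(new StandardAnalyzer(), "The Quick Brown Fox");
// StandardAnalyzer lowercases, so this prints something like [the, quick, brown, fox]
// (versions that ship English stop words by default would also drop "the").
System.out.println(tokens);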