Example usage for org.apache.lucene.analysis TokenStream addAttribute

Introduction

On this page you can find example usages of org.apache.lucene.analysis.TokenStream#addAttribute.

Prototype

public final <T extends Attribute> T addAttribute(Class<T> attClass) 

Document

The caller must pass in a Class&lt;? extends Attribute&gt; value. If an attribute of that type is already registered with the stream, the existing instance is returned; otherwise a new instance is created, registered with the stream, and returned.
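
Before the examples, here is a minimal, self-contained sketch of the standard consumption workflow around addAttribute: register the attribute, reset the stream, iterate, end, close. It assumes a recent Lucene line where WhitespaceAnalyzer takes no constructor arguments; the analyzer, class name, and field name are illustrative, not taken from the examples below.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AddAttributeSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new WhitespaceAnalyzer();
        // TokenStream implements Closeable, so try-with-resources closes it
        try (TokenStream ts = analyzer.tokenStream("content", new StringReader("hello token stream"))) {
            // addAttribute registers the attribute with the stream,
            // or returns the instance that is already registered
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset(); // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            ts.end(); // record final offset state
        }
    }
}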

Usage

From source file: io.anserini.analysis.TweetTokenizationTest.java

License: Apache License

public List<String> parseKeywords(Analyzer analyzer, String keywords) throws IOException {
    List<String> list = new ArrayList<>();

    TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(keywords));
    CharTermAttribute cattr = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        if (cattr.toString().length() == 0) {
            continue;
        }
        list.add(cattr.toString());
    }
    tokenStream.end();
    tokenStream.close();

    return list;
}
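
A hypothetical call site for this helper (the EnglishAnalyzer from org.apache.lucene.analysis.en and the sample input are illustrative assumptions, not part of the Anserini test):

// EnglishAnalyzer lowercases, stems, and drops English stopwords.
List<String> tokens = parseKeywords(new EnglishAnalyzer(), "A sample tweet, with punctuation!");
System.out.println(tokens); // e.g. [sampl, tweet, punctuat]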

From source file: io.bdrc.lucene.bo.TibetanAnalyzerTest.java

License: Apache License

static private void assertOffsets(String inputStr, TokenStream tokenStream, List<String> expected) {
    try {
        List<String> termList = new ArrayList<String>();
        // CharTermAttribute charTermAttribute =
        // tokenStream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttr = tokenStream.addAttribute(OffsetAttribute.class);
        while (tokenStream.incrementToken()) {
            int start = offsetAttr.startOffset();
            int end = offsetAttr.endOffset();
            termList.add(inputStr.substring(start, end));
        }
        System.out.println(String.join(" ", termList));
        assertThat(termList, is(expected));
    } catch (IOException e) {
        assertTrue(false);
    }
}

From source file: io.bdrc.lucene.bo.TibetanAnalyzerTest.java

License: Apache License

static private void assertTokenStream(TokenStream tokenStream, List<String> expected) {
    try {
        List<String> termList = new ArrayList<String>();
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        while (tokenStream.incrementToken()) {
            termList.add(charTermAttribute.toString());
        }
        System.out.println(String.join(" ", termList));
        assertThat(termList, is(expected));
    } catch (IOException e) {
        assertTrue(false);
    }
}
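
A hypothetical JUnit test driving this helper; WhitespaceAnalyzer stands in for the TibetanAnalyzer the real tests construct. Note that assertTokenStream iterates but does not call reset() itself, so the caller must do it:

@Test
public void testAssertTokenStreamHelper() throws IOException {
    Analyzer analyzer = new WhitespaceAnalyzer(); // stand-in analyzer, an assumption
    TokenStream ts = analyzer.tokenStream("", new StringReader("one two three"));
    ts.reset(); // the helper expects an already-reset stream
    assertTokenStream(ts, Arrays.asList("one", "two", "three"));
}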

From source file: it.cnr.ilc.lc.clavius.search.ClaviusHighlighter.java

public final List<Annotation> getBestTextClaviusFragments(TokenStream tokenStream, String idDoc,
        boolean mergeContiguousFragments, int maxNumFragments)
        throws IOException, InvalidTokenOffsetsException {

    List<Annotation> ret = new ArrayList<>();

    ArrayList<ClaviusTextFragment> docFrags = new ArrayList<>();
    StringBuilder newText = new StringBuilder();

    Scorer fragmentScorer = getFragmentScorer();
    Fragmenter textFragmenter = getTextFragmenter();
    int maxDocCharsToAnalyze = getMaxDocCharsToAnalyze();
    Encoder encoder = getEncoder();

    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    ClaviusTextFragment currentFrag = new ClaviusTextFragment(newText, newText.length(), docFrags.size());

    if (fragmentScorer instanceof QueryScorer) {
        ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
    }

    TokenStream newStream = fragmentScorer.init(tokenStream);
    if (newStream != null) {
        tokenStream = newStream;
    }
    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);

    //        FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
    try {

        String tokenText;
        int startOffset;
        int endOffset;
        int lastEndOffset = 0;
        //textFragmenter.start(text, tokenStream);

        ClaviusTokenGroup tokenGroup = new ClaviusTokenGroup(tokenStream);

        tokenStream.reset();
        // log.info("tokenGroup.getNumTokens() A: " + tokenGroup.getNumTokens());

        for (boolean next = tokenStream.incrementToken(); next
                && (offsetAtt.startOffset() < maxDocCharsToAnalyze); next = tokenStream.incrementToken()) {

            //                if ((offsetAtt.endOffset() > text.length())
            //                        || (offsetAtt.startOffset() > text.length())) {
            //                    throw new InvalidTokenOffsetsException("Token " + termAtt.toString()
            //                            + " exceeds length of provided text sized " + text.length());
            //                }
            //  log.info("newText: A (" + newText.toString() + "), fragmentScorer.getTokenScore()("+fragmentScorer.getTokenScore()+")");
            tokenGroup.addToken(fragmentScorer.getTokenScore());

        } // END FOR
          //  log.info("tokenGroup.getNumTokens() B: " + tokenGroup.getNumTokens());

        for (int i = 0; i < tokenGroup.getNumTokens(); i++) {
            //log.info("tokenGroup[" + i + "]: token: " + tokenGroup.getToken(i) + ", score: " + tokenGroup.getScore(i));
            if (tokenGroup.getScore(i) > 0) {
                Annotation a = new Annotation();
                a.setMatched(tokenGroup.getToken(i).toString());
                a.setIdDoc(idDoc);
                // left context
                Token[] t = Arrays.copyOfRange(tokenGroup.getTokens(), (i > ctxLenght) ? i - ctxLenght : 0, i);
                StringBuilder sb = new StringBuilder();
                for (int j = 0; j < t.length; j++) {
                    sb.append(t[j].toString());
                    if (j < t.length - 1) {
                        sb.append(" ");
                    }
                }
                a.setLeftContext(sb.toString());
                sb.setLength(0);
                // right context
                t = Arrays.copyOfRange(tokenGroup.getTokens(), i + 1,
                        (i + ctxLenght + 1 < tokenGroup.getNumTokens() ? i + ctxLenght + 1
                                : tokenGroup.getNumTokens()));
                sb = new StringBuilder();
                for (int j = 0; j < t.length; j++) {
                    sb.append(t[j].toString());
                    if (j < t.length - 1) {
                        sb.append(" ");
                    }
                }
                a.setRightContext(sb.toString());

                a.setConcept("");
                a.setType("");
                a.setIdNeo4j(-1l);
                a.setPageNum(-1l);
                a.setResourceObject("");
                a.setId(-1l);

                ret.add(a);
            }
        }

        return ret;

    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.end();
                tokenStream.close();
            } catch (Exception e) {
            }
        }
    }
}

From source file: it.cnr.ilc.lc.clavius.search.ClaviusTokenGroup.java

public ClaviusTokenGroup(TokenStream tokenStream) {
    super(tokenStream);
    this.offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    this.termAtt = tokenStream.addAttribute(CharTermAttribute.class);
}

From source file: it.cnr.ilc.lc.claviusweb.fulltextsearch.ClaviusHighlighter.java

public final List<Annotation> getBestTextClaviusFragments(TokenStream tokenStream, Document document,
        boolean mergeContiguousFragments, int maxNumFragments)
        throws IOException, InvalidTokenOffsetsException {

    List<Annotation> ret = new ArrayList<>();

    ArrayList<ClaviusTextFragment> docFrags = new ArrayList<>();
    StringBuilder newText = new StringBuilder();

    Scorer fragmentScorer = getFragmentScorer();
    Fragmenter textFragmenter = getTextFragmenter();
    int maxDocCharsToAnalyze = getMaxDocCharsToAnalyze();
    Encoder encoder = getEncoder();

    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    ClaviusTextFragment currentFrag = new ClaviusTextFragment(newText, newText.length(), docFrags.size());

    if (fragmentScorer instanceof QueryScorer) {
        ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
    }

    TokenStream newStream = fragmentScorer.init(tokenStream);
    if (newStream != null) {
        tokenStream = newStream;
    }
    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);

    //        FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
    try {

        String tokenText;
        int startOffset;
        int endOffset;
        int lastEndOffset = 0;
        //textFragmenter.start(text, tokenStream);

        ClaviusTokenGroup tokenGroup = new ClaviusTokenGroup(tokenStream);

        tokenStream.reset();
        //log.info("tokenGroup.getNumTokens() A: " + tokenGroup.getNumTokens());

        for (boolean next = tokenStream.incrementToken(); next
                && (offsetAtt.startOffset() < maxDocCharsToAnalyze); next = tokenStream.incrementToken()) {

            //                if ((offsetAtt.endOffset() > text.length())
            //                        || (offsetAtt.startOffset() > text.length())) {
            //                    throw new InvalidTokenOffsetsException("Token " + termAtt.toString()
            //                            + " exceeds length of provided text sized " + text.length());
            //                }
            //  log.info("newText: A (" + newText.toString() + "), fragmentScorer.getTokenScore()("+fragmentScorer.getTokenScore()+")");
            tokenGroup.addToken(fragmentScorer.getTokenScore());

        } // END FOR
          //log.info("tokenGroup.getNumTokens() B: " + tokenGroup.getNumTokens());

        for (int i = 0; i < tokenGroup.getNumTokens(); i++) {
            //log.info("tokenGroup[" + i + "]: token: " + tokenGroup.getToken(i) + ", score: " + tokenGroup.getScore(i));
            if (tokenGroup.getScore(i) > 0) {
                Annotation a = new Annotation();
                a.setMatched(tokenGroup.getToken(i).toString());
                a.setIdDoc(document.get("idDoc"));
                // left context
                Token[] t = Arrays.copyOfRange(tokenGroup.getTokens(), (i > ctxLenght) ? i - ctxLenght : 0, i);
                StringBuilder sb = new StringBuilder();
                for (int j = 0; j < t.length; j++) {
                    sb.append(t[j].toString());
                    if (j < t.length - 1) {
                        sb.append(" ");
                    }
                }
                a.setLeftContext(sb.toString());
                sb.setLength(0);
                // right context
                t = Arrays.copyOfRange(tokenGroup.getTokens(), i + 1,
                        (i + ctxLenght + 1 < tokenGroup.getNumTokens() ? i + ctxLenght + 1
                                : tokenGroup.getNumTokens()));
                sb = new StringBuilder();
                for (int j = 0; j < t.length; j++) {
                    sb.append(t[j].toString());
                    if (j < t.length - 1) {
                        sb.append(" ");
                    }
                }
                a.setRightContext(sb.toString());

                a.setConcept("");
                a.setType("");
                a.setPageNum(-1l);
                // a.setIdNeo4j(Long.parseLong(document.get("idNeo4j")));
                a.setIdNeo4j(Long.parseLong(document.get("idDoc")));
                a.setResourceObject("");
                a.setId(-1l);

                ret.add(a);
            }
        }

        return ret;

    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.end();
                tokenStream.close();
            } catch (Exception e) {
            }
        }
    }
}

From source file: it.cnr.isti.hpc.dexter.analysis.DexterAnalyzer.java

License: Apache License

public static void main(String[] args) throws IOException {
    String str = "<body>perch";
    Analyzer anal = new DexterAnalyzer();
    TokenStream ts = anal.tokenStream("content", new StringReader(str));

    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        System.out.println(termAtt.toString().substring(0, termAtt.length()));
        System.out.println("token start offset: " + offsetAtt.startOffset());
        System.out.println("  token end offset: " + offsetAtt.endOffset());
    }
}

From source file: it.cnr.isti.hpc.dexter.analysis.SpotCleaner.java

License: Apache License

public String clean(String spot) throws IOException {
    try {
        spot = URLDecoder.decode(spot, "UTF-8");
    } catch (IllegalArgumentException e) {

    }

    analyzer.lowercase(spot.length() > 4);

    TokenStream ts = analyzer.tokenStream("content", new StringReader(spot));

    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    sb.setLength(0);
    int tokens = 0;
    while (ts.incrementToken()) {
        tokens++;
        sb.append(termAtt.toString());
        sb.append(' ');
        if (tokens > maxSpotLength) {
            return "";
        }
    }
    ts.end();
    ts.close();
    if (sb.length() > 0)
        sb.setLength(sb.length() - 1);
    // System.out.println(spot + " -> " + "[" + sb.toString() + "]");
    String finalSpot = sb.toString();
    for (Filter<String> filter : filters) {
        if (filter.isFilter(finalSpot)) {
            finalSpot = "";
        }
    }
    return finalSpot;
}

From source file: it.cnr.isti.hpc.dexter.spot.DocumentFrequencyGenerator.java

License: Apache License

private void initBloomFilter(Iterator<String> spotIterator) {
    String spot = spotIterator.next();
    analyzer.setShingles(false);

    ProgressLogger pl = new ProgressLogger("added {} spots to the bloom filter", 100000);
    pl.up();
    while (spotIterator.hasNext()) {
        String next = spotIterator.next();
        if (next.equals(spot))
            continue;
        pl.up();
        spot = next;
        TokenStream ts = null;
        try {
            ts = analyzer.tokenStream("content", new StringReader(spot));
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            // add only the first token of each distinct spot to the bloom filter
            if (ts.incrementToken()) {
                spot = termAtt.toString();
                bf.add(spot);
            }
            ts.end();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (ts != null) {
                try {
                    ts.close();
                } catch (IOException e) {
                    // ignore failures while closing
                }
            }
        }
    }

}

From source file: it.unibz.instasearch.indexing.StorageIndexer.java

License: Open Source License

/**
 * Extracts terms from text.
 * 
 * @param text
 * @return a map of terms to their offsets in text
 * @throws IOException
 */
public static Map<String, List<Integer>> extractTextTerms(String text) throws IOException {
    Map<String, List<Integer>> terms = new HashMap<String, List<Integer>>();
    TokenStream tokenStream = fileAnalyzer.tokenStream(Field.CONTENTS.toString(), new StringReader(text));

    TermAttribute termAtt = tokenStream.addAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);

    while (tokenStream.incrementToken()) {
        String termText = termAtt.term().toLowerCase();// t.termText().toLowerCase();
        int offset = offsetAtt.startOffset();

        List<Integer> offsets = terms.get(termText);

        if (offsets == null) {
            offsets = new LinkedList<Integer>();
            terms.put(termText, offsets);
        }

        offsets.add(offset);
    }
    tokenStream.close();

    return terms;
}
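
This last example targets the pre-4.0 attribute API: TermAttribute was removed in Lucene 4.0 in favor of CharTermAttribute, and the modern TokenStream contract also requires reset() before consumption. A sketch of the same term-to-offsets extraction on a current Lucene line (fileAnalyzer and Field.CONTENTS come from the surrounding class; the rest is an assumption):

public static Map<String, List<Integer>> extractTextTerms(String text) throws IOException {
    Map<String, List<Integer>> terms = new HashMap<>();
    try (TokenStream tokenStream = fileAnalyzer.tokenStream(Field.CONTENTS.toString(), new StringReader(text))) {
        CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
        tokenStream.reset(); // required by the modern contract
        while (tokenStream.incrementToken()) {
            String termText = termAtt.toString().toLowerCase();
            // map each lowercased term to the list of its start offsets
            terms.computeIfAbsent(termText, k -> new ArrayList<>()).add(offsetAtt.startOffset());
        }
        tokenStream.end();
    }
    return terms;
}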