Example usage for org.apache.lucene.analysis.tokenattributes CharTermAttribute setEmpty

List of usage examples for org.apache.lucene.analysis.tokenattributes CharTermAttribute setEmpty

Introduction

On this page you can find example usages of org.apache.lucene.analysis.tokenattributes CharTermAttribute.setEmpty.

Prototype

public CharTermAttribute setEmpty();

Source Link

Document

Sets the length of the termBuffer to zero.

Usage

From source file:com.fujitsu.ca.fic.caissepop.evaluation.TokenizeText.java

License:Apache License

/**
 * Pig UDF entry point: tokenizes the text in field 0 of the input tuple and
 * returns one single-field tuple per token.
 *
 * NOTE(review): {@code bagFactory}, {@code tupleFactory}, {@code analyzer} and
 * {@code noField} are fields of the enclosing UDF class, not visible here.
 *
 * @param input tuple whose first field holds the text to tokenize
 * @return a bag of single-field token tuples, or {@code null} for null/empty input
 * @throws IOException if the token stream fails
 */
@Override
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() < 1 || input.isNull(0)) {
        return null;
    }

    DataBag bagOfTokens = bagFactory.newDefaultBag();
    TokenStream tokenStream = null;
    try {
        String lineOfText = input.get(0).toString();
        StringReader textInput = new StringReader(lineOfText);
        tokenStream = analyzer.tokenStream(noField, textInput);
        // addAttribute returns the existing attribute or registers one;
        // getAttribute would throw IllegalArgumentException when absent.
        CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();

        while (tokenStream.incrementToken()) {
            // incrementToken() overwrites the term buffer each iteration,
            // so no explicit setEmpty() is needed between tokens.
            bagOfTokens.add(tupleFactory.newTuple(termAttribute.toString()));
        }
        // Per the TokenStream contract, end() finalizes offsets after the
        // last incrementToken() returns false.
        tokenStream.end();
    } finally {
        if (tokenStream != null) {
            tokenStream.close();
        }
    }
    return bagOfTokens;
}

From source file:com.github.ippeiukai.externaltoken.lucene.analysis.TestPatternTokenizer.java

License:Apache License

/**
 * Renders all terms of the given stream as a single space-separated string.
 * Between tokens the attribute state is clobbered with a bogus value so a
 * tokenizer that leaks state between calls is caught by the comparison.
 *
 * TODO: rewrite tests not to use string comparison.
 */
private static String tsToString(TokenStream in) throws IOException {
    StringBuilder out = new StringBuilder();
    CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class);
    // extra safety to enforce, that the state is not preserved and also
    // assign bogus values
    in.clearAttributes();
    termAtt.setEmpty().append("bogusTerm");
    // reset() is required before the first incrementToken() by the
    // TokenStream contract (the sibling test in PatternTokenizerTests does this).
    in.reset();
    while (in.incrementToken()) {
        // Separator-first avoids appending and later deleting a trailing space.
        if (out.length() > 0)
            out.append(' ');
        out.append(termAtt.toString());
        in.clearAttributes();
        termAtt.setEmpty().append("bogusTerm");
    }

    in.close();
    return out.toString();
}

From source file:com.mozilla.grouperfish.lucene.analysis.en.NGramEnglishAnalyzer.java

License:Apache License

/** Ad-hoc demo: tokenizes a sample sentence and prints each non-empty term. */
public static void main(String[] args) throws IOException {
    Set<String> stopWords = Dictionary
            .loadDictionary(new Path("file:///Users/xstevens/workspace/akela/stopwords-en.txt"));
    NGramEnglishAnalyzer ngramAnalyzer = new com.mozilla.grouperfish.lucene.analysis.en.NGramEnglishAnalyzer(
            Version.LUCENE_31, stopWords, false, true);
    TokenStream tokens = ngramAnalyzer.tokenStream("",
            new StringReader("When I was growing up this was so much fun."));
    CharTermAttribute term = tokens.addAttribute(CharTermAttribute.class);
    while (tokens.incrementToken()) {
        if (term.length() == 0) {
            continue; // skip empty terms
        }
        System.out.println(term.toString());
        term.setEmpty();
    }
}

From source file:com.mozilla.grouperfish.pig.eval.text.NGramTokenize.java

License:Apache License

/**
 * Pig UDF entry point: tokenizes the text in field 0 into n-grams and returns
 * one single-field tuple per token.
 *
 * Optional positional arguments, read only on the first call while the
 * analyzer is being built:
 *   1: stopword dictionary path, 2: stem (boolean), 3: outputUnigrams (boolean),
 *   4: minNGram (int), 5: maxNGram (int)
 *
 * NOTE(review): the original also read a language code from input.get(6) into
 * a local that was never used; that dead local has been removed.
 *
 * @param input tuple whose first field holds the text to tokenize
 * @return a bag of single-field token tuples, or {@code null} for null/empty input
 * @throws IOException if the token stream fails
 */
@Override
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }

    if (analyzer == null) {
        if (input.size() > 1) {
            loadDictionary((String) input.get(1));
        }
        boolean stem = false;
        if (input.size() > 2) {
            stem = Boolean.parseBoolean((String) input.get(2));
        }
        boolean outputUnigrams = false;
        if (input.size() > 3) {
            outputUnigrams = Boolean.parseBoolean((String) input.get(3));
        }
        int minNGram = 2;
        if (input.size() > 4) {
            minNGram = Integer.parseInt((String) input.get(4));
        }
        int maxNGram = 3;
        if (input.size() > 5) {
            maxNGram = Integer.parseInt((String) input.get(5));
        }

        if (stopwords != null && stopwords.size() != 0) {
            analyzer = new com.mozilla.grouperfish.lucene.analysis.en.NGramEnglishAnalyzer(Version.LUCENE_31,
                    stopwords, stem, outputUnigrams, minNGram, maxNGram);
        } else {
            analyzer = new com.mozilla.grouperfish.lucene.analysis.en.NGramEnglishAnalyzer(Version.LUCENE_31,
                    StandardAnalyzer.STOP_WORDS_SET, stem, outputUnigrams, minNGram, maxNGram);
        }
    }

    DataBag output = bagFactory.newDefaultBag();
    TokenStream stream = analyzer.tokenStream(NOFIELD, new StringReader((String) input.get(0)));
    try {
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        while (stream.incrementToken()) {
            if (termAttr.length() > 0) {
                // incrementToken() overwrites the term buffer, so no
                // explicit setEmpty() is needed between tokens.
                output.add(tupleFactory.newTuple(termAttr.toString()));
            }
        }
    } finally {
        // The original leaked the stream; always release tokenizer resources.
        stream.close();
    }

    return output;
}

From source file:com.mozilla.grouperfish.pig.eval.text.Tokenize.java

License:Apache License

/**
 * Pig UDF entry point: tokenizes the text in field 0 with a language-specific
 * analyzer and returns one single-field tuple per token.
 *
 * Optional positional arguments, read only on the first call while the
 * analyzer is being built:
 *   1: stopword dictionary path, 2: stem (boolean), 3: language code
 *      (zh/ja → CJK, de → German, es → Spanish, otherwise English)
 *
 * @param input tuple whose first field holds the text to tokenize
 * @return a bag of single-field token tuples, or {@code null} for null/empty input
 * @throws IOException if the token stream fails
 */
@Override
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }

    if (analyzer == null) {
        String langCode = "en";
        if (input.size() > 1) {
            loadDictionary((String) input.get(1));
        }
        boolean stem = false;
        if (input.size() > 2) {
            stem = Boolean.parseBoolean((String) input.get(2));
        }
        if (input.size() > 3) {
            langCode = (String) input.get(3);
        }

        if (langCode.startsWith("zh") || langCode.startsWith("ja")) {
            analyzer = new org.apache.lucene.analysis.cjk.CJKAnalyzer(Version.LUCENE_31);
        } else if (langCode.startsWith("de")) {
            analyzer = new org.apache.lucene.analysis.de.GermanAnalyzer(Version.LUCENE_31);
        } else if (langCode.startsWith("es")) {
            analyzer = new org.apache.lucene.analysis.es.SpanishAnalyzer(Version.LUCENE_31);
        } else {
            if (stopwords != null && stopwords.size() > 0) {
                analyzer = new EnglishAnalyzer(Version.LUCENE_31, stopwords, stem);
            } else {
                analyzer = new EnglishAnalyzer(Version.LUCENE_31, stem);
            }
        }
    }

    DataBag output = bagFactory.newDefaultBag();
    TokenStream stream = analyzer.tokenStream(NOFIELD, new StringReader((String) input.get(0)));
    try {
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        while (stream.incrementToken()) {
            if (termAttr.length() > 0) {
                // incrementToken() overwrites the term buffer, so no
                // explicit setEmpty() is needed between tokens.
                output.add(tupleFactory.newTuple(termAttr.toString()));
            }
        }
    } finally {
        // The original leaked the stream; always release tokenizer resources.
        stream.close();
    }

    return output;
}

From source file:com.mozilla.grouperfish.transforms.coclustering.lucene.analysis.en.NGramEnglishAnalyzer.java

License:Apache License

/** Ad-hoc demo: tokenizes a sample sentence and prints each non-empty term. */
public static void main(String[] args) throws IOException {
    // TODO: SMELLY: de-system-ify
    Set<String> stopWords = Dictionary
            .loadDictionary(new Path("file:///Users/xstevens/workspace/akela/stopwords-en.txt"));
    NGramEnglishAnalyzer ngramAnalyzer = new NGramEnglishAnalyzer(Version.LUCENE_31, stopWords, false, true);
    TokenStream tokens = ngramAnalyzer.tokenStream("",
            new StringReader("When I was growing up this was so much fun."));
    CharTermAttribute term = tokens.addAttribute(CharTermAttribute.class);
    while (tokens.incrementToken()) {
        if (term.length() == 0) {
            continue; // skip empty terms
        }
        System.out.println(term.toString());
        term.setEmpty();
    }
}

From source file:com.mozilla.grouperfish.transforms.coclustering.pig.eval.text.NGramTokenize.java

License:Apache License

/**
 * Pig UDF entry point: tokenizes the text in field 0 into n-grams and returns
 * one single-field tuple per token.
 *
 * Optional positional arguments, read only on the first call while the
 * analyzer is being built:
 *   1: stopword dictionary path, 2: stem (boolean), 3: outputUnigrams (boolean),
 *   4: minNGram (int), 5: maxNGram (int)
 *
 * @param input tuple whose first field holds the text to tokenize
 * @return a bag of single-field token tuples, or {@code null} for null/empty input
 * @throws IOException if the token stream fails
 */
@Override
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }

    if (analyzer == null) {
        if (input.size() > 1) {
            loadDictionary((String) input.get(1));
        }
        boolean stem = false;
        if (input.size() > 2) {
            stem = Boolean.parseBoolean((String) input.get(2));
        }
        boolean outputUnigrams = false;
        if (input.size() > 3) {
            outputUnigrams = Boolean.parseBoolean((String) input.get(3));
        }
        int minNGram = 2;
        if (input.size() > 4) {
            minNGram = Integer.parseInt((String) input.get(4));
        }
        int maxNGram = 3;
        if (input.size() > 5) {
            maxNGram = Integer.parseInt((String) input.get(5));
        }

        if (stopwords != null && stopwords.size() != 0) {
            analyzer = new NGramEnglishAnalyzer(Version.LUCENE_31, stopwords, stem, outputUnigrams, minNGram,
                    maxNGram);
        } else {
            analyzer = new NGramEnglishAnalyzer(Version.LUCENE_31, StandardAnalyzer.STOP_WORDS_SET, stem,
                    outputUnigrams, minNGram, maxNGram);
        }
    }

    DataBag output = bagFactory.newDefaultBag();
    TokenStream stream = analyzer.tokenStream(NOFIELD, new StringReader((String) input.get(0)));
    try {
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        while (stream.incrementToken()) {
            if (termAttr.length() > 0) {
                // incrementToken() overwrites the term buffer, so no
                // explicit setEmpty() is needed between tokens.
                output.add(tupleFactory.newTuple(termAttr.toString()));
            }
        }
    } finally {
        // The original leaked the stream; always release tokenizer resources.
        stream.close();
    }

    return output;
}

From source file:com.sindicetech.siren.analysis.attributes.NodeNumericTermAttributeImpl.java

License:Open Source License

/**
 * Produces the next numeric token into the given term attribute and advances
 * the shift by one precision step.
 *
 * @param termAtt attribute to receive the encoded term
 * @return {@code true} if a token was produced; {@code false} at end of stream
 */
public boolean incrementShift(final CharTermAttribute termAtt) {
    if (shift < valueSize) {
        try {
            // Encode the value at the current shift into the term attribute.
            this.bytesRefToChar(termAtt);
            shift += precisionStep; // move to the next precision level
            return true;
        } catch (final IllegalArgumentException iae) {
            // Empty token before first or after last: clear the term and
            // force end-of-stream on subsequent calls.
            termAtt.setEmpty();
            shift = valueSize;
        }
    }
    return false;
}

From source file:com.underthehood.weblogs.lucene.AutoPhrasingTokenFilter.java

License:Apache License

/**
 * Writes the given token into the stream's term, offset and position
 * attributes and remembers it as the last emitted token.
 */
private void emit(char[] token) {
    char[] out = (replaceWhitespaceWith != null) ? replaceWhiteSpace(token) : token;

    // Replace the current term text with the (possibly rewritten) token.
    CharTermAttribute term = getTermAttribute();
    term.setEmpty().append(new StringBuilder().append(out));

    // Rewind the start offset so the reported span covers exactly this token.
    OffsetAttribute offsets = getOffsetAttribute();
    if (offsets != null && offsets.endOffset() >= out.length) {
        offsets.setOffset(offsets.endOffset() - out.length, offsets.endOffset());
    }

    PositionIncrementAttribute posIncr = getPositionIncrementAttribute();
    if (posIncr != null) {
        positionIncr++;
        posIncr.setPositionIncrement(positionIncr);
    }

    lastEmitted = out;
}

From source file:org.elasticsearch.index.analysis.PatternTokenizerTests.java

License:Apache License

/**
 * Materializes all terms of the given stream into one space-separated string.
 * Between tokens the attribute state is clobbered with a bogus value so that
 * a tokenizer which leaks state between calls is caught by the comparison.
 *
 * TODO: rewrite tests not to use string comparison.
 */
private static String tsToString(TokenStream in) throws IOException {
    CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class);
    StringBuilder sb = new StringBuilder();

    // Poison the attribute state before the first token.
    in.clearAttributes();
    termAtt.setEmpty().append("bogusTerm");
    in.reset();
    while (in.incrementToken()) {
        if (sb.length() != 0) {
            sb.append(' ');
        }
        sb.append(termAtt); // CharTermAttribute is a CharSequence
        // Poison again so the next incrementToken() must fully overwrite.
        in.clearAttributes();
        termAtt.setEmpty().append("bogusTerm");
    }

    in.close();
    return sb.toString();
}