List of usage examples for org.apache.lucene.analysis TokenStream getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class<? extends Attribute> value. getAttribute does not create the attribute if it is missing; depending on the Lucene version, asking for an attribute the stream does not contain either throws IllegalArgumentException or returns null, which is why consumers typically register the attribute with addAttribute(Class) first.
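Before the per-project examples below, here is a minimal, self-contained sketch of the usual consumption pattern: register the attribute, reset() the stream, read the attribute inside the incrementToken() loop, then end() and close(). It assumes a recent Lucene release (no-argument StandardAnalyzer constructor and the String-based tokenStream overload); the class name, the field name "text", and the sample sentence are arbitrary and only for illustration.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class GetAttributeExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // The field name is arbitrary here; no index is involved.
        try (TokenStream ts = analyzer.tokenStream("text", "Lucene token streams expose attributes")) {
            // addAttribute registers CharTermAttribute if the stream does not have it yet,
            // so the getAttribute call inside the loop is guaranteed to find it.
            ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
                System.out.println(term.toString());
            }
            ts.end();
        }
        analyzer.close();
    }
}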
From source file:com.cloudera.knittingboar.utils.DatasetConverter.java
License:Apache License
public static String ReadFullFile(Analyzer analyzer, String newsgroup_name, String file) throws IOException {
    String out = newsgroup_name + "\t";
    BufferedReader reader = null;
    // Collection<String> words
    Multiset<String> words = ConcurrentHashMultiset.create();
    try {
        reader = new BufferedReader(new FileReader(file));
        TokenStream ts = analyzer.tokenStream("text", reader);
        ts.addAttribute(CharTermAttribute.class);
        ts.reset(); // required before incrementToken() on current Lucene versions
        // for each word in the stream, minus non-word stuff, add word to collection
        while (ts.incrementToken()) {
            String s = ts.getAttribute(CharTermAttribute.class).toString();
            out += s + " ";
        }
    } finally {
        if (reader != null) {
            reader.close();
        }
    }
    return out + "\n";
}
From source file:com.dhamacher.sentimentanalysis4tweets.sentiment.Tokenizer.java
License:Apache License
/**
 * Retrieve the tokens in a String. Behaves like getTokens, but operates on
 * a string instead of a tweet object.
 *
 * @param text The text to tokenize.
 * @return The tokens in the text.
 */
// Version 1
/*public LinkedList<String> getTokens (String text) {
    LinkedList<String> tokens = new LinkedList();
    String[] words = text.split(" ");
    tokens.addAll(Arrays.asList(words));
    return tokens;
}*/
// Version 2
public static LinkedList<String> getTokens(String text) throws IOException {
    LinkedList<String> tokens = new LinkedList();
    TokenStream ts = new StandardTokenizer(new StringReader(text));
    // TermAttribute is the pre-4.0 Lucene term attribute; newer code uses CharTermAttribute
    TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
        tokens.add(termAtt.term());
        //System.out.print(termAtt.term());
    }
    return tokens;
}
From source file:com.fujitsu.ca.fic.caissepop.evaluation.TokenizeText.java
License:Apache License
@Override
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() < 1 || input.isNull(0)) {
        return null;
    }
    DataBag bagOfTokens = bagFactory.newDefaultBag();
    TokenStream tokenStream = null;
    try {
        String lineOfText = input.get(0).toString();
        StringReader textInput = new StringReader(lineOfText);
        tokenStream = analyzer.tokenStream(noField, textInput);
        CharTermAttribute termAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            Tuple termText = tupleFactory.newTuple(termAttribute.toString());
            bagOfTokens.add(termText);
            termAttribute.setEmpty();
        }
    } finally {
        if (tokenStream != null) {
            tokenStream.close();
        }
    }
    return bagOfTokens;
}
From source file:com.github.healthonnet.search.SynonymExpandingExtendedDismaxQParserPlugin.java
License:Apache License
private String analyzeQuery(String query, Analyzer analyzer) {
    if (analyzer != null && query != null && query.length() > 0) {
        TokenStream tokenStream = analyzer.tokenStream(Const.IMPOSSIBLE_FIELD_NAME, new StringReader(query));
        StringBuilder newQueryB = new StringBuilder();
        try {
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class);
                // OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
                // TypeAttribute typeAttribute = tokenStream.getAttribute(TypeAttribute.class);
                newQueryB.append(term.toString());
                newQueryB.append(' ');
            }
            tokenStream.end();
            return newQueryB.toString().trim();
        } catch (IOException e) {
            throw new RuntimeException("uncaught exception in synonym processing", e);
        } finally {
            try {
                tokenStream.close();
            } catch (IOException e) {
                throw new RuntimeException("uncaught exception in synonym processing", e);
            }
        }
    }
    return query;
}
From source file:com.github.healthonnet.search.SynonymExpandingExtendedDismaxQParserPlugin.java
License:Apache License
/**
 * Given the synonymAnalyzer, returns a list of all alternate queries expanded from the original user query.
 *
 * @param synonymAnalyzer
 * @param solrParams
 * @return
 */
private List<Query> generateSynonymQueries(Analyzer synonymAnalyzer, SolrParams solrParams) {
    String origQuery = getQueryStringFromParser();
    int queryLen = origQuery.length();

    // TODO: make the token stream reusable?
    TokenStream tokenStream = synonymAnalyzer.tokenStream(Const.IMPOSSIBLE_FIELD_NAME,
            new StringReader(origQuery));

    SortedSetMultimap<Integer, TextInQuery> startPosToTextsInQuery = TreeMultimap.create();

    boolean constructPhraseQueries = solrParams.getBool(Params.SYNONYMS_CONSTRUCT_PHRASES, false);
    boolean bag = solrParams.getBool(Params.SYNONYMS_BAG, false);
    List<String> synonymBag = new ArrayList<>();

    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAttribute = tokenStream.getAttribute(TypeAttribute.class);

            if (!typeAttribute.type().equals("shingle")) {
                // ignore shingles; we only care about synonyms and the original text
                // TODO: filter other types as well

                String termToAdd = term.toString();

                if (typeAttribute.type().equals("SYNONYM")) {
                    synonymBag.add(termToAdd);
                }

                // Don't quote single-term synonyms
                if (constructPhraseQueries && typeAttribute.type().equals("SYNONYM")
                        && termToAdd.contains(" ")) {
                    // Don't quote when the original is already surrounded by quotes
                    if (offsetAttribute.startOffset() == 0 || offsetAttribute.endOffset() == queryLen
                            || origQuery.charAt(offsetAttribute.startOffset() - 1) != '"'
                            || origQuery.charAt(offsetAttribute.endOffset()) != '"') {
                        // make a phrase out of the synonym
                        termToAdd = new StringBuilder(termToAdd).insert(0, '"').append('"').toString();
                    }
                }
                if (!bag) {
                    // create a graph of all possible synonym combinations,
                    // e.g. dog bite, hound bite, dog nibble, hound nibble, etc.
                    TextInQuery textInQuery = new TextInQuery(termToAdd, offsetAttribute.startOffset(),
                            offsetAttribute.endOffset());
                    startPosToTextsInQuery.put(offsetAttribute.startOffset(), textInQuery);
                }
            }
        }
        tokenStream.end();
    } catch (IOException e) {
        throw new RuntimeException("uncaught exception in synonym processing", e);
    } finally {
        try {
            tokenStream.close();
        } catch (IOException e) {
            throw new RuntimeException("uncaught exception in synonym processing", e);
        }
    }

    List<String> alternateQueries = synonymBag;

    if (!bag) {
        // use a graph rather than a bag
        List<List<TextInQuery>> sortedTextsInQuery = new ArrayList<>(startPosToTextsInQuery.values().size());
        sortedTextsInQuery.addAll(startPosToTextsInQuery.asMap().values().stream().map(ArrayList::new)
                .collect(Collectors.toList()));

        // have to use the start positions and end positions to figure out all possible combinations
        alternateQueries = buildUpAlternateQueries(solrParams, sortedTextsInQuery);
    }

    // save for debugging purposes
    expandedSynonyms = alternateQueries;

    return createSynonymQueries(solrParams, alternateQueries);
}
From source file:com.github.riccardove.easyjasub.lucene.LuceneParser.java
License:Apache License
private void readBaseForm(TokenStream tokenStream, LuceneToken token) {
    BaseFormAttribute baseForm = tokenStream.getAttribute(BaseFormAttribute.class);
    if (baseForm != null) {
        token.setBaseForm(baseForm.getBaseForm());
    }
}
From source file:com.github.riccardove.easyjasub.lucene.LuceneParser.java
License:Apache License
private void readInflection(TokenStream tokenStream, LuceneToken token) {
    InflectionAttribute inflection = tokenStream.getAttribute(InflectionAttribute.class);
    if (inflection != null) {
        token.setInflectionForm(LuceneUtil.translateInflectedForm(inflection.getInflectionForm()));
        token.setInflectionType(LuceneUtil.translateInflectionType(inflection.getInflectionType()));
    }
}
From source file:com.github.riccardove.easyjasub.lucene.LuceneParser.java
License:Apache License
private void readPartOfSpeech(TokenStream tokenStream, LuceneToken token) {
    PartOfSpeechAttribute partOfSpeech = tokenStream.getAttribute(PartOfSpeechAttribute.class);
    if (partOfSpeech != null) {
        String str = partOfSpeech.getPartOfSpeech();
        if (str != null) {
            token.setPartOfSpeech(LuceneUtil.translatePartOfSpeech(str));
        }
    }
}
From source file:com.github.riccardove.easyjasub.lucene.LuceneParser.java
License:Apache License
private void readReading(TokenStream tokenStream, LuceneToken token) {
    ReadingAttribute reading = tokenStream.getAttribute(ReadingAttribute.class);
    if (reading != null) {
        token.setPronunciation(reading.getPronunciation());
        token.setReading(reading.getReading());
    }
}
From source file:com.github.riccardove.easyjasub.lucene.LuceneParser.java
License:Apache License
private void readOffset(TokenStream tokenStream, LuceneToken token) {
    OffsetAttribute offset = tokenStream.getAttribute(OffsetAttribute.class);
    if (offset != null) {
        token.setStartOffset(offset.startOffset());
        token.setEndOffset(offset.endOffset());
    }
}