Example usage for org.apache.lucene.analysis TokenStream getAttribute

List of usage examples for org.apache.lucene.analysis TokenStream getAttribute

Introduction

In this page you can find the example usage for org.apache.lucene.analysis TokenStream getAttribute.

Prototype

public final <T extends Attribute> T getAttribute(Class<T> attClass) 

Source Link

Document

Returns the instance of the passed in Attribute contained in this AttributeSource

The caller must pass in a Class<? extends Attribute> value identifying the attribute to retrieve.

Usage

From source file:com.cloudera.knittingboar.utils.DatasetConverter.java

License:Apache License

/**
 * Reads and tokenizes an entire file, returning a single line of the form
 * {@code "<newsgroup_name>\t<token> <token> ... \n"}.
 *
 * @param analyzer       the Lucene analyzer used to tokenize the file contents
 * @param newsgroup_name label prepended (tab-separated) to the token list
 * @param file           path of the file to read
 * @return the labeled, space-separated token string, terminated by a newline
 * @throws IOException if the file cannot be read or tokenization fails
 */
public static String ReadFullFile(Analyzer analyzer, String newsgroup_name, String file) throws IOException {

    // StringBuilder avoids the O(n^2) cost of repeated String concatenation.
    StringBuilder out = new StringBuilder(newsgroup_name).append('\t');

    // try-with-resources guarantees both the reader and the token stream are
    // closed even if tokenization throws.
    try (BufferedReader reader = new BufferedReader(new FileReader(file));
            TokenStream ts = analyzer.tokenStream("text", reader)) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

        // The TokenStream contract requires reset() before the first
        // incrementToken() and end() after the last one.
        ts.reset();
        while (ts.incrementToken()) {
            out.append(termAtt.toString()).append(' ');
        }
        ts.end();
    }

    return out.append('\n').toString();
}

From source file:com.dhamacher.sentimentanalysis4tweets.sentiment.Tokenizer.java

License:Apache License

/**
 *  Retrieve the tokens in a String. Behaves like getTokens, but operates on
 *  a string instead of a tweet object.
 * 
 *  @param  text    The text to tokenize.
 *  @return         The tokens in the text.
 */

// Version 1
/*public LinkedList<String> getTokens (String text) {
LinkedList<String> tokens   = new LinkedList();
String[] words              = text.split(" ");
tokens.addAll(Arrays.asList(words));
return tokens;
}*/

// Version 2
/**
 * Tokenizes the given text with a {@link StandardTokenizer} and returns the
 * tokens in order of appearance.
 *
 * @param text the text to tokenize
 * @return the tokens found in the text
 * @throws IOException if tokenization fails
 */
public static LinkedList<String> getTokens(String text) throws IOException {
    // Parameterized instantiation instead of the raw type used before.
    LinkedList<String> tokens = new LinkedList<String>();
    TokenStream ts = new StandardTokenizer(new StringReader(text));
    try {
        // addAttribute returns the existing instance or registers one,
        // whereas getAttribute fails when the attribute is absent.
        TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
        // reset() is required before the first incrementToken() call.
        ts.reset();
        while (ts.incrementToken()) {
            tokens.add(termAtt.term());
        }
        ts.end();
    } finally {
        // Always release the tokenizer's resources.
        ts.close();
    }
    return tokens;
}

From source file:com.fujitsu.ca.fic.caissepop.evaluation.TokenizeText.java

License:Apache License

@Override
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() < 1 || input.isNull(0)) {
        return null;
    }//  w  ww .j a  v  a 2  s.c  om

    DataBag bagOfTokens = bagFactory.newDefaultBag();
    TokenStream tokenStream = null;
    try {
        String lineOfText = input.get(0).toString();
        StringReader textInput = new StringReader(lineOfText);
        tokenStream = analyzer.tokenStream(noField, textInput);
        CharTermAttribute termAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        tokenStream.reset();

        while (tokenStream.incrementToken()) {
            Tuple termText = tupleFactory.newTuple(termAttribute.toString());
            bagOfTokens.add(termText);
            termAttribute.setEmpty();
        }
    } finally {
        if (tokenStream != null) {
            tokenStream.close();
        }
    }
    return bagOfTokens;
}

From source file:com.github.healthonnet.search.SynonymExpandingExtendedDismaxQParserPlugin.java

License:Apache License

/**
 * Runs the query text through the given analyzer and rebuilds it as a
 * space-separated string of the resulting tokens.
 *
 * @param query    the raw query text; returned unchanged if null/empty
 * @param analyzer the analyzer to apply; if null the query is returned as-is
 * @return the analyzed query, or the original query when no analysis applies
 */
private String analyzeQuery(String query, Analyzer analyzer) {

    if (analyzer != null && query != null && query.length() > 0) {
        TokenStream tokenStream = analyzer.tokenStream(Const.IMPOSSIBLE_FIELD_NAME, new StringReader(query));

        StringBuilder newQueryB = new StringBuilder();
        try {
            // The attribute instance is fixed for the stream's lifetime, so
            // fetch it once instead of on every loop iteration.
            CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                newQueryB.append(term.toString());
                newQueryB.append(' ');
            }
            tokenStream.end();
            return newQueryB.toString().trim();

        } catch (IOException e) {
            throw new RuntimeException("uncaught exception in synonym processing", e);
        } finally {
            try {
                tokenStream.close();
            } catch (IOException e) {
                throw new RuntimeException("uncaught exception in synonym processing", e);
            }
        }
    }

    return query;

}

From source file:com.github.healthonnet.search.SynonymExpandingExtendedDismaxQParserPlugin.java

License:Apache License

/**
 * Given the synonymAnalyzer, returns a list of all alternate queries expanded from the original user query.
 *
 * @param synonymAnalyzer
 * @param solrParams
 * @return
 */
private List<Query> generateSynonymQueries(Analyzer synonymAnalyzer, SolrParams solrParams) {

    String origQuery = getQueryStringFromParser();
    int queryLen = origQuery.length();

    // TODO: make the token stream reusable?
    TokenStream tokenStream = synonymAnalyzer.tokenStream(Const.IMPOSSIBLE_FIELD_NAME,
            new StringReader(origQuery));

    // Maps each token's start offset in the original query to every candidate
    // text (original term or synonym) beginning at that offset.
    SortedSetMultimap<Integer, TextInQuery> startPosToTextsInQuery = TreeMultimap.create();

    boolean constructPhraseQueries = solrParams.getBool(Params.SYNONYMS_CONSTRUCT_PHRASES, false);

    // "Bag" mode treats every synonym as an independent alternate query
    // instead of building positional combinations from the offset graph.
    boolean bag = solrParams.getBool(Params.SYNONYMS_BAG, false);
    List<String> synonymBag = new ArrayList<>();

    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAttribute = tokenStream.getAttribute(TypeAttribute.class);

            if (!typeAttribute.type().equals("shingle")) {
                // ignore shingles; we only care about synonyms and the original text
                // TODO: filter other types as well

                String termToAdd = term.toString();

                if (typeAttribute.type().equals("SYNONYM")) {
                    synonymBag.add(termToAdd);
                }

                // Don't quote single-term synonyms; only multi-word synonyms
                // (those containing a space) are turned into phrases.
                if (constructPhraseQueries && typeAttribute.type().equals("SYNONYM")
                        && termToAdd.contains(" ")) {
                    // Don't quote when the original text is already surrounded by
                    // quotes in the user's query (boundary offsets checked first to
                    // keep charAt() in range).
                    if (offsetAttribute.startOffset() == 0 || offsetAttribute.endOffset() == queryLen
                            || origQuery.charAt(offsetAttribute.startOffset() - 1) != '"'
                            || origQuery.charAt(offsetAttribute.endOffset()) != '"') {
                        // make a phrase out of the synonym
                        termToAdd = new StringBuilder(termToAdd).insert(0, '"').append('"').toString();
                    }
                }
                if (!bag) {
                    // create a graph of all possible synonym combinations,
                    // e.g. dog bite, hound bite, dog nibble, hound nibble, etc.
                    TextInQuery textInQuery = new TextInQuery(termToAdd, offsetAttribute.startOffset(),
                            offsetAttribute.endOffset());

                    startPosToTextsInQuery.put(offsetAttribute.startOffset(), textInQuery);
                }
            }
        }
        tokenStream.end();
    } catch (IOException e) {
        throw new RuntimeException("uncaught exception in synonym processing", e);
    } finally {
        try {
            tokenStream.close();
        } catch (IOException e) {
            throw new RuntimeException("uncaught exception in synonym processing", e);
        }
    }

    List<String> alternateQueries = synonymBag;

    if (!bag) {
        // use a graph rather than a bag: group candidate texts by start offset,
        // in query order, then expand into all consistent combinations.
        List<List<TextInQuery>> sortedTextsInQuery = new ArrayList<>(startPosToTextsInQuery.values().size());
        sortedTextsInQuery.addAll(startPosToTextsInQuery.asMap().values().stream().map(ArrayList::new)
                .collect(Collectors.toList()));

        // have to use the start positions and end positions to figure out all possible combinations
        alternateQueries = buildUpAlternateQueries(solrParams, sortedTextsInQuery);
    }

    // save for debugging purposes
    expandedSynonyms = alternateQueries;

    return createSynonymQueries(solrParams, alternateQueries);
}

From source file:com.github.riccardove.easyjasub.lucene.LuceneParser.java

License:Apache License

/**
 * Copies the dictionary base form from the stream onto the token, when the
 * analyzer exposes a {@code BaseFormAttribute}.
 */
private void readBaseForm(TokenStream tokenStream, LuceneToken token) {
    final BaseFormAttribute attribute = tokenStream.getAttribute(BaseFormAttribute.class);
    if (attribute == null) {
        return;
    }
    token.setBaseForm(attribute.getBaseForm());
}

From source file:com.github.riccardove.easyjasub.lucene.LuceneParser.java

License:Apache License

/**
 * Copies the (translated) inflection form and type from the stream onto the
 * token, when the analyzer exposes an {@code InflectionAttribute}.
 */
private void readInflection(TokenStream tokenStream, LuceneToken token) {
    final InflectionAttribute attribute = tokenStream.getAttribute(InflectionAttribute.class);
    if (attribute == null) {
        return;
    }
    token.setInflectionForm(LuceneUtil.translateInflectedForm(attribute.getInflectionForm()));
    token.setInflectionType(LuceneUtil.translateInflectionType(attribute.getInflectionType()));
}

From source file:com.github.riccardove.easyjasub.lucene.LuceneParser.java

License:Apache License

/**
 * Copies the (translated) part of speech from the stream onto the token, when
 * the analyzer exposes a non-null {@code PartOfSpeechAttribute} value.
 */
private void readPartOfSpeech(TokenStream tokenStream, LuceneToken token) {
    final PartOfSpeechAttribute attribute = tokenStream.getAttribute(PartOfSpeechAttribute.class);
    if (attribute == null) {
        return;
    }
    final String partOfSpeech = attribute.getPartOfSpeech();
    if (partOfSpeech != null) {
        token.setPartOfSpeech(LuceneUtil.translatePartOfSpeech(partOfSpeech));
    }
}

From source file:com.github.riccardove.easyjasub.lucene.LuceneParser.java

License:Apache License

/**
 * Copies the pronunciation and reading from the stream onto the token, when
 * the analyzer exposes a {@code ReadingAttribute}.
 */
private void readReading(TokenStream tokenStream, LuceneToken token) {
    final ReadingAttribute attribute = tokenStream.getAttribute(ReadingAttribute.class);
    if (attribute == null) {
        return;
    }
    token.setPronunciation(attribute.getPronunciation());
    token.setReading(attribute.getReading());
}

From source file:com.github.riccardove.easyjasub.lucene.LuceneParser.java

License:Apache License

/**
 * Copies the start and end character offsets from the stream onto the token,
 * when the analyzer exposes an {@code OffsetAttribute}.
 */
private void readOffset(TokenStream tokenStream, LuceneToken token) {
    final OffsetAttribute attribute = tokenStream.getAttribute(OffsetAttribute.class);
    if (attribute == null) {
        return;
    }
    token.setStartOffset(attribute.startOffset());
    token.setEndOffset(attribute.endOffset());
}