Example usage for org.apache.lucene.analysis TokenStream getAttribute

List of usage examples for org.apache.lucene.analysis TokenStream getAttribute

Introduction

On this page you can find example usage for org.apache.lucene.analysis TokenStream getAttribute.

Prototype

public final <T extends Attribute> T getAttribute(Class<T> attClass) 

Document

Returns the instance of the passed-in Attribute contained in this AttributeSource.

The caller must pass in a Class<? extends Attribute> value.
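
Before the individual examples, here is a minimal consumption sketch. It is a hedged illustration rather than code from any of the files below (imports omitted, as in the snippets that follow), and it assumes a recent Lucene (5.x or later) where StandardAnalyzer has a no-argument constructor:

Analyzer analyzer = new StandardAnalyzer();
try (TokenStream ts = analyzer.tokenStream("body", "some text to analyze")) {
    // addAttribute creates the attribute if it is absent; getAttribute only looks it up
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        // every getAttribute call on a stream returns the same cached instance,
        // so this prints the term that termAtt currently holds
        System.out.println(ts.getAttribute(CharTermAttribute.class).toString());
    }
    ts.end();
}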

Usage

From source file:com.tuplejump.stargate.lucene.query.Condition.java

License:Apache License

protected String analyze(String field, String value, Analyzer analyzer) {
    TokenStream source = null;
    try {
        source = analyzer.tokenStream(field, value);
        source.reset();

        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();

        if (!source.incrementToken()) {
            return null;
        }
        termAtt.fillBytesRef();
        if (source.incrementToken()) {
            throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + value);
        }
        source.end();
        return BytesRef.deepCopyOf(bytes).utf8ToString();
    } catch (IOException e) {
        throw new RuntimeException("Error analyzing multiTerm term: " + value, e);
    } finally {
        IOUtils.closeWhileHandlingException(source);
    }
}
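
Note that fillBytesRef() is a Lucene 4.x API. In later releases (around 5.2, if memory serves) it was removed, and getBytesRef() simply reflects the current token after each incrementToken() call, so the middle of the method would collapse to something like this hedged sketch:

if (!source.incrementToken()) {
    return null;
}
BytesRef bytes = BytesRef.deepCopyOf(termAtt.getBytesRef());
if (source.incrementToken()) {
    throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + value);
}
source.end();
return bytes.utf8ToString();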

From source file:com.umaircheema.mahout.utils.classifiers.NaiveBayesClassifier.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Mahout Naive Bayesian Classifier");
        System.out.println(
                "Classifies input text document into a class given a model, dictionary, document frequency and input file");
        System.out.println(
                "Arguments: [model] [label_index] [dictionary] [document-frequency] [input-text-file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String inputFilePath = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);

    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));

    // analyzer used to extract word from input file
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);

    int labelCount = labels.size();
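    // Mahout stores the total document count under the special key -1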
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    BufferedReader reader = new BufferedReader(new FileReader(inputFilePath));
    StringBuilder stringBuilder = new StringBuilder();
    String lineSeparator = System.getProperty("line.separator");
    String line = null;
    while ((line = reader.readLine()) != null) {
        stringBuilder.append(line);
        stringBuilder.append(lineSeparator);
    }
    // Close the reader I/O
    reader.close();
    Multiset<String> words = ConcurrentHashMultiset.create();

    // extract words from input file
    TokenStream ts = analyzer.tokenStream("text", new StringReader(stringBuilder.toString()));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    int wordCount = 0;
    while (ts.incrementToken()) {
        if (termAtt.length() > 0) {
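            // getAttribute returns the same cached instance that addAttribute returned
            // above, so this is equivalent to termAtt.toString()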
            String word = ts.getAttribute(CharTermAttribute.class).toString();
            Integer wordId = dictionary.get(word);
            // if the word is not in the dictionary, skip it
            if (wordId != null) {
                words.add(word);
                wordCount++;
            }
        }
    }
    // close the TokenStream
    ts.end();
    ts.close();
    // create vector wordId => weight using tfidf
    Vector vector = new RandomAccessSparseVector(10000);
    TFIDF tfidf = new TFIDF();
    for (Multiset.Entry<String> entry : words.entrySet()) {
        String word = entry.getElement();
        int count = entry.getCount();
        Integer wordId = dictionary.get(word);
        Long freq = documentFrequency.get(wordId);
        double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
        vector.setQuick(wordId, tfIdfValue);
    }
    // With the classifier, we get one score for each label.
    // The label with the highest score is the one the document is most
    // likely to be associated with.

    double bestScore = -Double.MAX_VALUE;
    int bestCategoryId = -1;
    Vector resultVector = classifier.classifyFull(vector);
    for (Element element : resultVector) {
        int categoryId = element.index();
        double score = element.get();
        if (score > bestScore) {
            bestScore = score;
            bestCategoryId = categoryId;
        }

    }
    System.out.println(" Class Labe: => " + labels.get(bestCategoryId));
    System.out.println(" Score: => " + bestScore);

    analyzer.close();

}

From source file:com.wiseowl.WiseOwl.query.WiseOwlQParser.java

License:Apache License

@Override
public Query parse() throws SyntaxError {

    //<start id="qqp.parse"/>
    Parse parse = ParserTool.parseLine(qstr, parser, 1)[0];//<co id="qqp.parseLine"/>
    /*
    <calloutlist>
        <callout arearefs="qqp.parseLine"><para>Parse the question using the <classname>TreebankParser</classname>.  The resulting <classname>Parse</classname> object can then be utilized by the classifier to determine the Answer Type.</para></callout>
    </calloutlist>
    */
    //<end id="qqp.parse"/>
    //<start id="qqp.answerType"/>
    // String type = "P";
    String type = atc.computeAnswerType(parse);
    String mt = atm.get(type);
    if (mt.equals("DESCRIPTION")) {
        BooleanQuery bq;
        BooleanQuery.Builder builder = new BooleanQuery.Builder();
        //BooleanQuery bq=new BooleanQuery(false, 0);
        String field = "text";
        SchemaField sf = req.getSchema().getFieldOrNull(field);
        try {
            Analyzer analyzer = sf.getType().getQueryAnalyzer();
            TokenStream ts = analyzer.tokenStream(field, new StringReader(qstr));
            ts.reset();
            CharTermAttribute tok = null;
            while (ts.incrementToken()) {//<co id="qqp.addTerms"/>
                tok = ts.getAttribute(CharTermAttribute.class);
                String term = tok.toString();
                //ts.reset();
                //log.warn("terms {} ",term);
                builder.add(new TermQuery(new Term(field, term)), BooleanClause.Occur.SHOULD);
            }
            ts.close();
        } catch (IOException e) {
            throw new SyntaxError(e.getLocalizedMessage());
        }
        bq = builder.build();
        return bq;
        //return new TermQuery(new Term("title", "she"));

    } else {
        //<end id="qqp.answerType"/>
        String field = "text";
        //params.get(QUERY_FIELD);
        //String field="text";
        SchemaField sp = req.getSchema().getFieldOrNull(field);
        if (sp == null) {
            throw new SolrException(ErrorCode.SERVER_ERROR, "Undefined field: " + field);
        }
        //<start id="qqp.query"/>
        List<SpanQuery> sql = new ArrayList<SpanQuery>();
        if (mt != null) {//<co id="qqp.handleAT"/>
            String[] parts = mt.split("\\|");
            if (parts.length == 1) {
                sql.add(new SpanTermQuery(new Term(field, mt.toLowerCase())));
            } else {
                for (int pi = 0; pi < parts.length; pi++) {
                    sql.add(new SpanTermQuery(new Term(field, parts[pi].toLowerCase())));
                }
            }
        }
        log.warn("answer type mt : {} {} ", mt, type);
        FocusNoun fn = new FocusNoun();
        String fnn[] = null;
        try {
            fnn = fn.getFocusNoun(qstr);
        } catch (IOException e1) {
            // TODO Auto-generated catch block
            e1.printStackTrace();
        }
        try {
            Analyzer analyzer = sp.getType().getQueryAnalyzer();
            TokenStream ts = analyzer.tokenStream(field, new StringReader(qstr));
            ts.reset();
            CharTermAttribute tok = null;

            while (ts.incrementToken()) {//<co id="qqp.addTerms"/>
                tok = ts.getAttribute(CharTermAttribute.class);
                String term = tok.toString();
                log.warn("terms boosted {} ", term);
                if (fnn != null)
                    if (term.equals(fnn[0]) || term.equals(fnn[1])) {
                        SpanQuery sq = new SpanTermQuery(new Term(field, term));
                        sql.add(new SpanBoostQuery(sq, 100f));
                    } else {
                        SpanQuery sq = new SpanTermQuery(new Term(field, term));
                        sql.add(new SpanBoostQuery(sq, 5f));
                    }

                // sql.add(new SpanTermQuery(new Term(field, term)));
            }
            ts.close();
        } catch (IOException e) {
            throw new SyntaxError(e.getLocalizedMessage());
        }
        return new SpanOrQuery(sql.toArray(new SpanQuery[sql.size()]));
        // return new SpanNearQuery(sql.toArray(new SpanQuery[sql.size()]), params.getInt(OWLParams.SLOP, 10), true);//<co id="qqp.spanNear"/>
        /*
        <calloutlist>
            <callout arearefs="qqp.handleAT"><para>Add the AnswerType to the query</para></callout>
            <callout arearefs="qqp.addTerms"><para>Add the original query terms to the query</para></callout>
            <callout arearefs="qqp.spanNear"><para>Query the index looking for all of the parts near each other</para></callout>
        </calloutlist>
        */
        //<end id="qqp.query"/>

    }
}

From source file:com.xiaomi.linden.lucene.analyzer.CommonMMSeg4jSegmenter.java

License:Apache License

@Override
public List<Term> parse(String content) throws Exception {
    List<Term> words = new ArrayList<>();
    if (content == null || content.isEmpty()) {
        return words;
    }

    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream("", content);
        stream.reset();
        if (stopWords != null) {
            if (cutLetterDigit) {
                stream = new CutLetterDigitFilter(new StopFilter(stream, stopWords));
            } else {
                stream = new StopFilter(stream, stopWords);
            }
        } else {
            if (cutLetterDigit) {
                stream = new CutLetterDigitFilter(stream);
            }
        }
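        // TokenFilters share their input's AttributeSource, so getAttribute on the
        // wrapped stream still returns the attribute instances of the underlying tokenizer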
        CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = stream.getAttribute(OffsetAttribute.class);
        while (stream.incrementToken()) {
            words.add(
                    new Term(termAttr.toString(), offsetAttribute.startOffset(), offsetAttribute.endOffset()));
        }
    } catch (IOException e) {
        throw new Exception(content + " extract words from phrase failed!", e);
    } finally {
        if (stream != null) {
            stream.close();
        }
    }
    return words;
}

From source file:com.xiaomi.linden.lucene.analyzer.TestLindenWordDelimiterAnalyzer.java

License:Apache License

@Test
public void testLindenWordDelimiterAnalyzer() throws Exception {
    LindenWordDelimiterAnalyzerFactory wordDelimiterAnalyzerFactory = new LindenWordDelimiterAnalyzerFactory();
    Map<String, String> args = new HashMap<>();
    Map<String, String> lastargs = new HashMap<>();
    args.put("luceneMatchVersion", "LUCENE_4_10_0");
    lastargs.putAll(args);
    Analyzer analyzer = wordDelimiterAnalyzerFactory.getInstance(args);
    TokenStream stream = analyzer.tokenStream("", new StringReader("Hello, this is a test case. "
            + "" + "created2018by sls sun-li-shun SunLiShun"));
    String expected = "[hello][test][case][][][][][][][][][][][][created][2018][sls][sun][li][shun][sun][li][shun]";
    String out = "";
    stream.reset();
    while (stream.incrementToken()) {
        out += "[" + stream.getAttribute(CharTermAttribute.class).toString() + "]";
    }
    Assert.assertEquals(expected, out);

    args.put("lower.case", "false");
    args.putAll(lastargs);
    lastargs.putAll(args);
    analyzer = wordDelimiterAnalyzerFactory.getInstance(args);
    stream = analyzer.tokenStream("", new StringReader("Hello, this is a test case. "
            + "" + "created2018by sls on 20140707"));
    expected = "[Hello][test][case][][][][][][][][][][][][created][2018][sls][20140707]";
    out = "";
    stream.reset();
    while (stream.incrementToken()) {
        out += "[" + stream.getAttribute(CharTermAttribute.class).toString() + "]";
    }
    Assert.assertEquals(expected, out);

    args.put("set.stopwords", "false");
    args.putAll(lastargs);
    lastargs.putAll(args);
    analyzer = wordDelimiterAnalyzerFactory.getInstance(args);
    stream = analyzer.tokenStream("", new StringReader("Hello, this is a test case. "
            + "" + "created2018by sls on 20140707"));
    expected = "[Hello][this][is][a][test][case][][][][][][][][][][][][created][2018][by][sls][on][20140707]";
    out = "";
    stream.reset();
    while (stream.incrementToken()) {
        out += "[" + stream.getAttribute(CharTermAttribute.class).toString() + "]";
    }
    Assert.assertEquals(expected, out);

    args.putAll(lastargs);
    args.put("splitOnCaseChange", "0");
    args.put("set.stopwords", "false");
    args.put("lower.case", "true");
    lastargs.putAll(args);
    analyzer = wordDelimiterAnalyzerFactory.getInstance(args);
    stream = analyzer.tokenStream("", new StringReader("Hello, this is a test case. "
            + "" + "created2018by sls sun-li-shun SunLiShun"));
    expected = "[hello][this][is][a][test][case][][][][][][][][][][][][created][2018][by][sls][sun][li][shun][sunlishun]";
    out = "";
    stream.reset();
    while (stream.incrementToken()) {
        out += "[" + stream.getAttribute(CharTermAttribute.class).toString() + "]";
    }
    Assert.assertEquals(expected, out);
}

From source file:com.xiaomi.linden.lucene.query.flexiblequery.FlexibleQuery.java

License:Apache License

private List<SegToken> parseToTokens(String content, float boost) throws IOException {
    List<SegToken> tokens = new ArrayList<>();
    TokenStream stream = analyzer.tokenStream("", new StringReader(content));
    try {
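        // getAttribute before reset() is fine here: attributes are registered
        // as soon as the analysis chain is constructed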
        CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(new SegToken(term.toString(), boost));
        }
    } finally {
        if (stream != null)
            stream.close();
    }
    return tokens;
}

From source file:com.zb.mmseg.analysis.TokenUtils.java

License:Open Source License

/**
 * @param input the token stream to read from
 * @param reusableToken if null, a new Token is created automatically
 * @return null if there is no next token or input is null
 * @throws IOException
 */
public static Token nextToken(TokenStream input, Token reusableToken) throws IOException {
    if (input == null) {
        return null;
    }
    if (!input.incrementToken()) {
        return null;
    }

    CharTermAttribute termAtt = input.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = input.getAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = input.getAttribute(TypeAttribute.class);

    if (reusableToken == null) {
        reusableToken = new Token();
    }

    reusableToken.clear();
    if (termAtt != null) {
        // lucene 3.0
        // reusableToken.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
        // lucene 3.1
        reusableToken.copyBuffer(termAtt.buffer(), 0, termAtt.length());
    }
    if (offsetAtt != null) {
        // lucene 3.1
        // reusableToken.setStartOffset(offsetAtt.startOffset());
        // reusableToken.setEndOffset(offsetAtt.endOffset());
        // lucene 4.0
        reusableToken.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
    }

    if (typeAtt != null) {
        reusableToken.setType(typeAtt.type());
    }

    return reusableToken;
}
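
A possible call site for this helper; the analyzer, field name, and text are placeholders, and the caller remains responsible for reset(), end(), and close():

TokenStream input = analyzer.tokenStream("f", new StringReader("some text"));
input.reset();
Token token = null;
while ((token = TokenUtils.nextToken(input, token)) != null) {
    System.out.println(token); // the same Token instance is reused across calls
}
input.end();
input.close();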

From source file:CopulaResources.TermCooccurence.java

private static List<String> tokenizeString(Analyzer analyzer, String str) {
    List<String> result = new ArrayList<>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(str));
        stream.reset();
        while (stream.incrementToken())
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return result;
}
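
A possible call site (StandardAnalyzer is only an illustrative choice of analyzer):

List<String> terms = tokenizeString(new StandardAnalyzer(), "Hello, Token Streams!");
// => [hello, token, streams]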

From source file:ddf.catalog.pubsub.criteria.contextual.ContextualEvaluator.java

License:Open Source License

private static void logTokens(Analyzer analyzer, String fieldName, String fullDocument, String analyzerName)
        throws IOException {
    if (!LOGGER.isDebugEnabled()) {
        return;
    }

    TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(fullDocument));
    OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
    TermAttribute termAttribute = tokenStream.getAttribute(TermAttribute.class);
    LOGGER.debug("-----  {} tokens  -----", analyzerName);
    while (tokenStream.incrementToken()) {
        int startOffset = offsetAttribute.startOffset();
        int endOffset = offsetAttribute.endOffset();
        String term = termAttribute.term();
        LOGGER.debug(term);
    }
    LOGGER.debug("-----  END:  {} tokens  -----", analyzerName);
}
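
TermAttribute only exists up to Lucene 3.x; it was removed in 4.0 in favor of CharTermAttribute, and 4.x also made reset() mandatory before incrementToken(). A hedged 4.x-style rewrite of the same helper might look like:

TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(fullDocument));
CharTermAttribute termAttribute = tokenStream.getAttribute(CharTermAttribute.class);
tokenStream.reset();
LOGGER.debug("-----  {} tokens  -----", analyzerName);
while (tokenStream.incrementToken()) {
    LOGGER.debug(termAttribute.toString());
}
LOGGER.debug("-----  END:  {} tokens  -----", analyzerName);
tokenStream.end();
tokenStream.close();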

From source file:de.blizzy.documentr.search.PageFinder.java

License:Open Source License

private SearchTextSuggestion getSearchTextSuggestion(String searchText, Authentication authentication,
        IndexSearcher searcher) throws IOException, ParseException, TimeoutException {

    List<WordPosition> words = Lists.newArrayList();

    TokenStream tokenStream = null;
    try {/*from   w ww .  ja v a  2 s  .  c  o m*/
        tokenStream = analyzer.tokenStream(PageIndex.ALL_TEXT_SUGGESTIONS, new StringReader(searchText));
        tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.addAttribute(OffsetAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class);
            String text = charTerm.toString();
            if (StringUtils.isNotBlank(text)) {
                OffsetAttribute offset = tokenStream.getAttribute(OffsetAttribute.class);
                WordPosition word = new WordPosition(text, offset.startOffset(), offset.endOffset());
                words.add(word);
            }
        }
        tokenStream.end();
    } finally {
        Util.closeQuietly(tokenStream);
    }

    Collections.reverse(words);

    StringBuilder suggestedSearchText = new StringBuilder(searchText);
    StringBuilder suggestedSearchTextHtml = new StringBuilder(searchText);
    boolean foundSuggestions = false;
    String now = String.valueOf(System.currentTimeMillis());
    String startMarker = "__SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    String endMarker = "__/SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    DirectSpellChecker spellChecker = new DirectSpellChecker();
    IndexReader reader = searcher.getIndexReader();
    for (WordPosition word : words) {
        Term term = new Term(PageIndex.ALL_TEXT_SUGGESTIONS, word.getWord());
        SuggestWord[] suggestions = spellChecker.suggestSimilar(term, 1, reader,
                SuggestMode.SUGGEST_MORE_POPULAR);
        if (suggestions.length > 0) {
            String suggestedWord = suggestions[0].string;
            int start = word.getStart();
            int end = word.getEnd();
            suggestedSearchText.replace(start, end, suggestedWord);
            suggestedSearchTextHtml.replace(start, end,
                    startMarker + StringEscapeUtils.escapeHtml4(suggestedWord) + endMarker);

            foundSuggestions = true;
        }
    }

    if (foundSuggestions) {
        String suggestion = suggestedSearchText.toString();
        SearchResult suggestionResult = findPages(suggestion, 1, authentication, searcher);
        int suggestionTotalHits = suggestionResult.getTotalHits();
        if (suggestionTotalHits > 0) {
            String html = StringEscapeUtils.escapeHtml4(suggestedSearchTextHtml.toString())
                    .replaceAll(startMarker + "(.*?)" + endMarker, "<strong><em>$1</em></strong>"); //$NON-NLS-1$ //$NON-NLS-2$
            return new SearchTextSuggestion(suggestedSearchText.toString(), html, suggestionTotalHits);
        }
    }

    return null;
}