List of usage examples for org.apache.lucene.analysis TokenStream getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class<? extends Attribute> value. getAttribute returns the instance of that attribute contained in the stream's AttributeSource and throws an IllegalArgumentException if the stream does not contain it; use addAttribute(Class) instead when the attribute may not have been added yet.
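Before the repository examples below, here is a minimal, self-contained sketch of the typical consumer loop: register (or fetch) the attribute, reset the stream, iterate with incrementToken(), then end and close. The field name "body", the sample text, and the StandardAnalyzer choice are illustrative placeholders, not taken from any of the projects listed here.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class GetAttributeExample {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer()) {
            TokenStream ts = analyzer.tokenStream("body", "Hello getAttribute example");
            // addAttribute registers the attribute if it is absent; getAttribute would throw
            // IllegalArgumentException if the stream did not already contain it.
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // getAttribute returns the same attribute instance on every call
                System.out.println(ts.getAttribute(CharTermAttribute.class).toString());
            }
            ts.end();
            ts.close();
        }
    }
}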
From source file:com.tuplejump.stargate.lucene.query.Condition.java
License:Apache License
protected String analyze(String field, String value, Analyzer analyzer) {
    TokenStream source = null;
    try {
        source = analyzer.tokenStream(field, value);
        source.reset();
        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();
        if (!source.incrementToken()) {
            return null;
        }
        termAtt.fillBytesRef();
        if (source.incrementToken()) {
            throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + value);
        }
        source.end();
        return BytesRef.deepCopyOf(bytes).utf8ToString();
    } catch (IOException e) {
        throw new RuntimeException("Error analyzing multiTerm term: " + value, e);
    } finally {
        IOUtils.closeWhileHandlingException(source);
    }
}
From source file:com.umaircheema.mahout.utils.classifiers.NaiveBayesClassifier.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Mahout Naive Bayesian Classifier");
        System.out.println("Classifies input text document into a class given a model, dictionary, document frequency and input file");
        System.out.println("Arguments: [model] [label_index] [dictionary] [document-frequency] [input-text-file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String inputFilePath = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract words from the input file
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();
    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    BufferedReader reader = new BufferedReader(new FileReader(inputFilePath));
    StringBuilder stringBuilder = new StringBuilder();
    String lineSeparator = System.getProperty("line.separator");
    String line = null;
    while ((line = reader.readLine()) != null) {
        stringBuilder.append(line);
        stringBuilder.append(lineSeparator);
    }
    // close the reader
    reader.close();

    Multiset<String> words = ConcurrentHashMultiset.create();

    // extract words from the input file
    TokenStream ts = analyzer.tokenStream("text", new StringReader(stringBuilder.toString()));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    int wordCount = 0;
    while (ts.incrementToken()) {
        if (termAtt.length() > 0) {
            String word = ts.getAttribute(CharTermAttribute.class).toString();
            Integer wordId = dictionary.get(word);
            // if the word is not in the dictionary, skip it
            if (wordId != null) {
                words.add(word);
                wordCount++;
            }
        }
    }
    // close the TokenStream
    ts.end();
    ts.close();

    // create vector wordId => weight using tf-idf
    Vector vector = new RandomAccessSparseVector(10000);
    TFIDF tfidf = new TFIDF();
    for (Multiset.Entry<String> entry : words.entrySet()) {
        String word = entry.getElement();
        int count = entry.getCount();
        Integer wordId = dictionary.get(word);
        Long freq = documentFrequency.get(wordId);
        double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
        vector.setQuick(wordId, tfIdfValue);
    }

    // The classifier returns one score per label; the label with the highest
    // score is the one the document most likely belongs to.
    double bestScore = -Double.MAX_VALUE;
    int bestCategoryId = -1;
    Vector resultVector = classifier.classifyFull(vector);
    for (Element element : resultVector) {
        int categoryId = element.index();
        double score = element.get();
        if (score > bestScore) {
            bestScore = score;
            bestCategoryId = categoryId;
        }
    }
    System.out.println(" Class Label: => " + labels.get(bestCategoryId));
    System.out.println(" Score: => " + bestScore);

    analyzer.close();
}
From source file:com.wiseowl.WiseOwl.query.WiseOwlQParser.java
License:Apache License
@Override
public Query parse() throws SyntaxError {
    // Parse the question using the TreebankParser; the resulting Parse object
    // is used by the classifier to determine the answer type.
    Parse parse = ParserTool.parseLine(qstr, parser, 1)[0];
    String type = atc.computeAnswerType(parse);
    String mt = atm.get(type);
    if (mt.equals("DESCRIPTION")) {
        BooleanQuery bq;
        BooleanQuery.Builder builder = new BooleanQuery.Builder();
        String field = "text";
        SchemaField sf = req.getSchema().getFieldOrNull(field);
        try {
            Analyzer analyzer = sf.getType().getQueryAnalyzer();
            TokenStream ts = analyzer.tokenStream(field, new StringReader(qstr));
            ts.reset();
            CharTermAttribute tok = null;
            while (ts.incrementToken()) {
                // add the original query terms to the query
                tok = ts.getAttribute(CharTermAttribute.class);
                String term = tok.toString();
                builder.add(new TermQuery(new Term(field, term)), BooleanClause.Occur.SHOULD);
            }
            ts.close();
        } catch (IOException e) {
            throw new SyntaxError(e.getLocalizedMessage());
        }
        bq = builder.build();
        return bq;
    } else {
        String field = "text";
        SchemaField sp = req.getSchema().getFieldOrNull(field);
        if (sp == null) {
            throw new SolrException(ErrorCode.SERVER_ERROR, "Undefined field: " + field);
        }
        List<SpanQuery> sql = new ArrayList<SpanQuery>();
        if (mt != null) {
            // add the answer type to the query
            String[] parts = mt.split("\\|");
            if (parts.length == 1) {
                sql.add(new SpanTermQuery(new Term(field, mt.toLowerCase())));
            } else {
                for (int pi = 0; pi < parts.length; pi++) {
                    sql.add(new SpanTermQuery(new Term(field, parts[pi].toLowerCase())));
                }
            }
        }
        log.warn("answer type mt : {} {} ", mt, type);
        FocusNoun fn = new FocusNoun();
        String fnn[] = null;
        try {
            fnn = fn.getFocusNoun(qstr);
        } catch (IOException e1) {
            e1.printStackTrace();
        }
        try {
            Analyzer analyzer = sp.getType().getQueryAnalyzer();
            TokenStream ts = analyzer.tokenStream(field, new StringReader(qstr));
            ts.reset();
            CharTermAttribute tok = null;
            while (ts.incrementToken()) {
                // add the original query terms, boosting the focus nouns more strongly
                tok = ts.getAttribute(CharTermAttribute.class);
                String term = tok.toString();
                log.warn("terms boosted {} ", term);
                if (fnn != null) {
                    if (term.equals(fnn[0]) || term.equals(fnn[1])) {
                        SpanQuery sq = new SpanTermQuery(new Term(field, term));
                        sql.add(new SpanBoostQuery(sq, 100f));
                    } else {
                        SpanQuery sq = new SpanTermQuery(new Term(field, term));
                        sql.add(new SpanBoostQuery(sq, 5f));
                    }
                }
            }
            ts.close();
        } catch (IOException e) {
            throw new SyntaxError(e.getLocalizedMessage());
        }
        // query the index for all of the parts near each other
        return new SpanOrQuery(sql.toArray(new SpanQuery[sql.size()]));
    }
}
From source file:com.xiaomi.linden.lucene.analyzer.CommonMMSeg4jSegmenter.java
License:Apache License
@Override
public List<Term> parse(String content) throws Exception {
    List<Term> words = new ArrayList<>();
    if (content == null || content.isEmpty()) {
        return words;
    }
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream("", content);
        stream.reset();
        if (stopWords != null) {
            if (cutLetterDigit) {
                stream = new CutLetterDigitFilter(new StopFilter(stream, stopWords));
            } else {
                stream = new StopFilter(stream, stopWords);
            }
        } else {
            if (cutLetterDigit) {
                stream = new CutLetterDigitFilter(stream);
            }
        }
        CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = stream.getAttribute(OffsetAttribute.class);
        while (stream.incrementToken()) {
            words.add(new Term(termAttr.toString(), offsetAttribute.startOffset(), offsetAttribute.endOffset()));
        }
    } catch (IOException e) {
        throw new Exception(content + " extract words from phrase failed!", e);
    } finally {
        if (stream != null) {
            stream.close();
        }
    }
    return words;
}
From source file:com.xiaomi.linden.lucene.analyzer.TestLindenWordDelimiterAnalyzer.java
License:Apache License
@Test
public void testLindenWordDelimiterAnalyzer() throws Exception {
    LindenWordDelimiterAnalyzerFactory wordDelimiterAnalyzerFactory = new LindenWordDelimiterAnalyzerFactory();
    Map<String, String> args = new HashMap<>();
    Map<String, String> lastargs = new HashMap<>();
    args.put("luceneMatchVersion", "LUCENE_4_10_0");
    lastargs.putAll(args);
    Analyzer analyzer = wordDelimiterAnalyzerFactory.getInstance(args);
    TokenStream stream = analyzer.tokenStream("",
        new StringReader("Hello, this is a test case. " + "" + "created2018by sls sun-li-shun SunLiShun"));
    String expected = "[hello][test][case][][][][][][][][][][][][created][2018][sls][sun][li][shun][sun][li][shun]";
    String out = "";
    stream.reset();
    while (stream.incrementToken()) {
        out += "[" + stream.getAttribute(CharTermAttribute.class).toString() + "]";
    }
    Assert.assertEquals(expected, out);

    args.put("lower.case", "false");
    args.putAll(lastargs);
    lastargs.putAll(args);
    analyzer = wordDelimiterAnalyzerFactory.getInstance(args);
    stream = analyzer.tokenStream("",
        new StringReader("Hello, this is a test case. " + "" + "created2018by sls on 20140707"));
    expected = "[Hello][test][case][][][][][][][][][][][][created][2018][sls][20140707]";
    out = "";
    stream.reset();
    while (stream.incrementToken()) {
        out += "[" + stream.getAttribute(CharTermAttribute.class).toString() + "]";
    }
    Assert.assertEquals(expected, out);

    args.put("set.stopwords", "false");
    args.putAll(lastargs);
    lastargs.putAll(args);
    analyzer = wordDelimiterAnalyzerFactory.getInstance(args);
    stream = analyzer.tokenStream("",
        new StringReader("Hello, this is a test case. " + "" + "created2018by sls on 20140707"));
    expected = "[Hello][this][is][a][test][case][][][][][][][][][][][][created][2018][by][sls][on][20140707]";
    out = "";
    stream.reset();
    while (stream.incrementToken()) {
        out += "[" + stream.getAttribute(CharTermAttribute.class).toString() + "]";
    }
    Assert.assertEquals(expected, out);

    args.putAll(lastargs);
    args.put("splitOnCaseChange", "0");
    args.put("set.stopwords", "false");
    args.put("lower.case", "true");
    lastargs.putAll(args);
    analyzer = wordDelimiterAnalyzerFactory.getInstance(args);
    stream = analyzer.tokenStream("",
        new StringReader("Hello, this is a test case. " + "" + "created2018by sls sun-li-shun SunLiShun"));
    expected = "[hello][this][is][a][test][case][][][][][][][][][][][][created][2018][by][sls][sun][li][shun][sunlishun]";
    out = "";
    stream.reset();
    while (stream.incrementToken()) {
        out += "[" + stream.getAttribute(CharTermAttribute.class).toString() + "]";
    }
    Assert.assertEquals(expected, out);
}
From source file:com.xiaomi.linden.lucene.query.flexiblequery.FlexibleQuery.java
License:Apache License
private List<SegToken> parseToTokens(String content, float boost) throws IOException {
    List<SegToken> tokens = new ArrayList<>();
    TokenStream stream = analyzer.tokenStream("", new StringReader(content));
    try {
        CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(new SegToken(term.toString(), boost));
        }
    } finally {
        if (stream != null) {
            stream.close();
        }
    }
    return tokens;
}
From source file:com.zb.mmseg.analysis.TokenUtils.java
License:Open Source License
/**
 * @param input the TokenStream to read the next token from
 * @param reusableToken if null, a new Token is created automatically
 * @return null if there is no next token or if input is null
 * @throws IOException
 */
public static Token nextToken(TokenStream input, Token reusableToken) throws IOException {
    if (input == null) {
        return null;
    }
    if (!input.incrementToken()) {
        return null;
    }
    CharTermAttribute termAtt = input.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = input.getAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = input.getAttribute(TypeAttribute.class);
    if (reusableToken == null) {
        reusableToken = new Token();
    }
    reusableToken.clear();
    if (termAtt != null) {
        // Lucene 3.1+: copyBuffer replaces the Lucene 3.0 setTermBuffer call
        reusableToken.copyBuffer(termAtt.buffer(), 0, termAtt.length());
    }
    if (offsetAtt != null) {
        // Lucene 4.0: setOffset replaces the separate setStartOffset/setEndOffset calls
        reusableToken.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
    }
    if (typeAtt != null) {
        reusableToken.setType(typeAtt.type());
    }
    return reusableToken;
}
From source file:CopulaResources.TermCooccurence.java
private static List tokenizeString(Analyzer analyzer, String str) {
    List result = new ArrayList<>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(str));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return result;
}
From source file:ddf.catalog.pubsub.criteria.contextual.ContextualEvaluator.java
License:Open Source License
private static void logTokens(Analyzer analyzer, String fieldName, String fullDocument, String analyzerName)
        throws IOException {
    if (!LOGGER.isDebugEnabled()) {
        return;
    }
    TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(fullDocument));
    OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
    TermAttribute termAttribute = tokenStream.getAttribute(TermAttribute.class);
    LOGGER.debug("----- {} tokens -----", analyzerName);
    while (tokenStream.incrementToken()) {
        int startOffset = offsetAttribute.startOffset();
        int endOffset = offsetAttribute.endOffset();
        String term = termAttribute.term();
        LOGGER.debug(term);
    }
    LOGGER.debug("----- END: {} tokens -----", analyzerName);
}
From source file:de.blizzy.documentr.search.PageFinder.java
License:Open Source License
private SearchTextSuggestion getSearchTextSuggestion(String searchText, Authentication authentication,
        IndexSearcher searcher) throws IOException, ParseException, TimeoutException {

    List<WordPosition> words = Lists.newArrayList();

    TokenStream tokenStream = null;
    try {
        tokenStream = analyzer.tokenStream(PageIndex.ALL_TEXT_SUGGESTIONS, new StringReader(searchText));
        tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.addAttribute(OffsetAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class);
            String text = charTerm.toString();
            if (StringUtils.isNotBlank(text)) {
                OffsetAttribute offset = tokenStream.getAttribute(OffsetAttribute.class);
                WordPosition word = new WordPosition(text, offset.startOffset(), offset.endOffset());
                words.add(word);
            }
        }
        tokenStream.end();
    } finally {
        Util.closeQuietly(tokenStream);
    }

    Collections.reverse(words);

    StringBuilder suggestedSearchText = new StringBuilder(searchText);
    StringBuilder suggestedSearchTextHtml = new StringBuilder(searchText);
    boolean foundSuggestions = false;
    String now = String.valueOf(System.currentTimeMillis());
    String startMarker = "__SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    String endMarker = "__/SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    DirectSpellChecker spellChecker = new DirectSpellChecker();
    IndexReader reader = searcher.getIndexReader();
    for (WordPosition word : words) {
        Term term = new Term(PageIndex.ALL_TEXT_SUGGESTIONS, word.getWord());
        SuggestWord[] suggestions = spellChecker.suggestSimilar(term, 1, reader, SuggestMode.SUGGEST_MORE_POPULAR);
        if (suggestions.length > 0) {
            String suggestedWord = suggestions[0].string;
            int start = word.getStart();
            int end = word.getEnd();
            suggestedSearchText.replace(start, end, suggestedWord);
            suggestedSearchTextHtml.replace(start, end,
                startMarker + StringEscapeUtils.escapeHtml4(suggestedWord) + endMarker);
            foundSuggestions = true;
        }
    }

    if (foundSuggestions) {
        String suggestion = suggestedSearchText.toString();
        SearchResult suggestionResult = findPages(suggestion, 1, authentication, searcher);
        int suggestionTotalHits = suggestionResult.getTotalHits();
        if (suggestionTotalHits > 0) {
            String html = StringEscapeUtils.escapeHtml4(suggestedSearchTextHtml.toString())
                .replaceAll(startMarker + "(.*?)" + endMarker, "<strong><em>$1</em></strong>"); //$NON-NLS-1$ //$NON-NLS-2$
            return new SearchTextSuggestion(suggestedSearchText.toString(), html, suggestionTotalHits);
        }
    }

    return null;
}