List of usage examples for org.apache.lucene.analysis TokenStream close
@Override public void close() throws IOException
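Before the project examples below, here is a minimal, self-contained sketch of the standard TokenStream lifecycle (reset, consume, end, close) using try-with-resources, which invokes close() automatically. The analyzer choice, field name, and sample text are placeholders for illustration and assume a recent Lucene version (5.x or later); they are not taken from any of the source files listed on this page.

import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamCloseSketch {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new StandardAnalyzer();
        // TokenStream implements Closeable, so try-with-resources closes it even if an exception is thrown
        try (TokenStream ts = analyzer.tokenStream("body", new StringReader("some sample text"))) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                      // must be called before incrementToken()
            while (ts.incrementToken()) {
                System.out.println(term.toString());
            }
            ts.end();                        // records end-of-stream state (final offset)
        }
        analyzer.close();
    }
}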
From source file:com.talis.lucene.analysis.Utils.java
License:Apache License
public static void assertStopWord(Analyzer a, String input) throws IOException {
    TokenStream ts = a.tokenStream("dummy", new StringReader(input));
    assertNull(ts.next());
    ts.close();
}
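Note that this example relies on the pre-3.0 TokenStream.next() API. A rough equivalent of the same stop-word assertion against the newer attribute-based API might look like the following sketch; the field name "dummy" mirrors the original, and the sketch is an assumption rather than code from the Talis source.

public static void assertStopWord(Analyzer a, String input) throws IOException {
    // A stop word should produce no tokens at all
    try (TokenStream ts = a.tokenStream("dummy", new StringReader(input))) {
        ts.reset();
        assertFalse(ts.incrementToken());
        ts.end();
    }
}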
From source file:com.umaircheema.mahout.utils.classifiers.NaiveBayesClassifier.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Mahout Naive Bayesian Classifier");
        System.out.println(
                "Classifies input text document into a class given a model, dictionary, document frequency and input file");
        System.out.println(
                "Arguments: [model] [label_index] [dictionary] [document-frequency] [input-text-file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String inputFilePath = args[4];
    Configuration configuration = new Configuration();
    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);
    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));
    // analyzer used to extract words from the input file
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();
    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);
    BufferedReader reader = new BufferedReader(new FileReader(inputFilePath));
    StringBuilder stringBuilder = new StringBuilder();
    String lineSeparator = System.getProperty("line.separator");
    String line = null;
    while ((line = reader.readLine()) != null) {
        stringBuilder.append(line);
        stringBuilder.append(lineSeparator);
    }
    // close the reader I/O
    reader.close();
    Multiset<String> words = ConcurrentHashMultiset.create();
    // extract words from the input file
    TokenStream ts = analyzer.tokenStream("text", new StringReader(stringBuilder.toString()));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    int wordCount = 0;
    while (ts.incrementToken()) {
        if (termAtt.length() > 0) {
            String word = ts.getAttribute(CharTermAttribute.class).toString();
            Integer wordId = dictionary.get(word);
            // if the word is not in the dictionary, skip it
            if (wordId != null) {
                words.add(word);
                wordCount++;
            }
        }
    }
    // release the TokenStream
    ts.end();
    ts.close();
    // create vector wordId => weight using tf-idf
    Vector vector = new RandomAccessSparseVector(10000);
    TFIDF tfidf = new TFIDF();
    for (Multiset.Entry<String> entry : words.entrySet()) {
        String word = entry.getElement();
        int count = entry.getCount();
        Integer wordId = dictionary.get(word);
        Long freq = documentFrequency.get(wordId);
        double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
        vector.setQuick(wordId, tfIdfValue);
    }
    // The classifier returns one score per label; the label with the highest
    // score is the one the document is most likely associated with.
    double bestScore = -Double.MAX_VALUE;
    int bestCategoryId = -1;
    Vector resultVector = classifier.classifyFull(vector);
    for (Element element : resultVector) {
        int categoryId = element.index();
        double score = element.get();
        if (score > bestScore) {
            bestScore = score;
            bestCategoryId = categoryId;
        }
    }
    System.out.println(" Class Label: => " + labels.get(bestCategoryId));
    System.out.println(" Score: => " + bestScore);
    analyzer.close();
}
From source file:com.weclay.ksearch2.BasicKoreanAnalyzer.java
License:Apache License
public static void main(String[] args) throws IOException {
    // text to tokenize
    //final String text = " ? ?";
    //String text = " ,?, ?";
    String text = " ??. . DB ? ? ?? , ? ? , , ?, ? ... ? ? ? ? ?.";
    BasicKoreanAnalyzer analyzer = new BasicKoreanAnalyzer();
    TokenStream stream = analyzer.tokenStream("field", new StringReader(text));
    // get the term and offset attributes from the TokenStream
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
    stream.reset();
    // print all tokens until the stream is exhausted
    while (stream.incrementToken()) {
        System.out.println(termAtt + ": " + termAtt.length() + " (" + offsetAtt.startOffset() + ":"
                + offsetAtt.endOffset() + ")");
    }
    stream.end();
    stream.close();
}
From source file:com.wiseowl.WiseOwl.query.WiseOwlQParser.java
License:Apache License
@Override
public Query parse() throws SyntaxError {
    //<start id="qqp.parse"/>
    Parse parse = ParserTool.parseLine(qstr, parser, 1)[0]; //<co id="qqp.parseLine"/>
    /*
    <calloutlist>
    <callout arearefs="qqp.parseLine"><para>Parse the question using the
    <classname>TreebankParser</classname>. The resulting <classname>Parse</classname> object can
    then be utilized by the classifier to determine the Answer Type.</para></callout>
    </calloutlist>
    */
    //<end id="qqp.parse"/>
    //<start id="qqp.answerType"/>
    // String type = "P";
    String type = atc.computeAnswerType(parse);
    String mt = atm.get(type);
    if (mt.equals("DESCRIPTION")) {
        BooleanQuery bq;
        BooleanQuery.Builder builder = new BooleanQuery.Builder();
        //BooleanQuery bq = new BooleanQuery(false, 0);
        String field = "text";
        SchemaField sf = req.getSchema().getFieldOrNull(field);
        try {
            Analyzer analyzer = sf.getType().getQueryAnalyzer();
            TokenStream ts = analyzer.tokenStream(field, new StringReader(qstr));
            ts.reset();
            CharTermAttribute tok = null;
            while (ts.incrementToken()) { //<co id="qqp.addTerms"/>
                tok = ts.getAttribute(CharTermAttribute.class);
                String term = tok.toString();
                //ts.reset();
                //log.warn("terms {} ", term);
                builder.add(new TermQuery(new Term(field, term)), BooleanClause.Occur.SHOULD);
            }
            ts.close();
        } catch (IOException e) {
            throw new SyntaxError(e.getLocalizedMessage());
        }
        bq = builder.build();
        return bq;
        //return new TermQuery(new Term("title", "she"));
    } else {
        //<end id="qqp.answerType"/>
        String field = "text"; //params.get(QUERY_FIELD);
        SchemaField sp = req.getSchema().getFieldOrNull(field);
        if (sp == null) {
            throw new SolrException(ErrorCode.SERVER_ERROR, "Undefined field: " + field);
        }
        //<start id="qqp.query"/>
        List<SpanQuery> sql = new ArrayList<SpanQuery>();
        if (mt != null) { //<co id="qqp.handleAT"/>
            String[] parts = mt.split("\\|");
            if (parts.length == 1) {
                sql.add(new SpanTermQuery(new Term(field, mt.toLowerCase())));
            } else {
                for (int pi = 0; pi < parts.length; pi++) {
                    sql.add(new SpanTermQuery(new Term(field, parts[pi].toLowerCase())));
                }
            }
        }
        log.warn("answer type mt : {} {} ", mt, type);
        FocusNoun fn = new FocusNoun();
        String fnn[] = null;
        try {
            fnn = fn.getFocusNoun(qstr);
        } catch (IOException e1) {
            // TODO Auto-generated catch block
            e1.printStackTrace();
        }
        try {
            Analyzer analyzer = sp.getType().getQueryAnalyzer();
            TokenStream ts = analyzer.tokenStream(field, new StringReader(qstr));
            ts.reset();
            CharTermAttribute tok = null;
            while (ts.incrementToken()) { //<co id="qqp.addTerms"/>
                tok = ts.getAttribute(CharTermAttribute.class);
                String term = tok.toString();
                log.warn("terms boosted {} ", term);
                if (fnn != null) {
                    if (term.equals(fnn[0]) || term.equals(fnn[1])) {
                        SpanQuery sq = new SpanTermQuery(new Term(field, term));
                        sql.add(new SpanBoostQuery(sq, 100f));
                    } else {
                        SpanQuery sq = new SpanTermQuery(new Term(field, term));
                        sql.add(new SpanBoostQuery(sq, 5f));
                    }
                }
                // sql.add(new SpanTermQuery(new Term(field, term)));
            }
            ts.close();
        } catch (IOException e) {
            throw new SyntaxError(e.getLocalizedMessage());
        }
        return new SpanOrQuery(sql.toArray(new SpanQuery[sql.size()]));
        // return new SpanNearQuery(sql.toArray(new SpanQuery[sql.size()]),
        //     params.getInt(OWLParams.SLOP, 10), true); //<co id="qqp.spanNear"/>
        /*
        <calloutlist>
        <callout arearefs="qqp.handleAT"><para>Add the AnswerType to the query</para></callout>
        <callout arearefs="qqp.addTerms"><para>Add the original query terms to the query</para></callout>
        <callout arearefs="qqp.spanNear"><para>Query the index looking for all of the parts near
        each other</para></callout>
        </calloutlist>
        */
        //<end id="qqp.query"/>
    }
}
From source file:com.wonders.xlab.healthcloud.IKAnalyzerDemo.java
License:Apache License
public static void main(String[] args) {
    // IKAnalyzer: the constructor flag selects smart segmentation (true) or fine-grained segmentation (false)
    Analyzer analyzer = new IKAnalyzer(false);
    // obtain a Lucene TokenStream
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield", new StringReader("??????"));
        // offset attribute for each token
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // term text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        // reset the TokenStream (and the underlying StringReader)
        ts.reset();
        // iterate over the tokens
        while (ts.incrementToken()) {
            // skip single-character tokens
            if (term.toString().length() == 1) {
                continue;
            }
            // print the offsets, term text, and token type
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        // perform end-of-stream operations, e.g. set the final offset
        ts.end();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // release all resources held by the TokenStream
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
From source file:com.xiaomi.linden.lucene.analyzer.CommonMMSeg4jSegmenter.java
License:Apache License
@Override
public List<Term> parse(String content) throws Exception {
    List<Term> words = new ArrayList<>();
    if (content == null || content.isEmpty()) {
        return words;
    }
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream("", content);
        stream.reset();
        if (stopWords != null) {
            if (cutLetterDigit) {
                stream = new CutLetterDigitFilter(new StopFilter(stream, stopWords));
            } else {
                stream = new StopFilter(stream, stopWords);
            }
        } else {
            if (cutLetterDigit) {
                stream = new CutLetterDigitFilter(stream);
            }
        }
        CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = stream.getAttribute(OffsetAttribute.class);
        while (stream.incrementToken()) {
            words.add(new Term(termAttr.toString(), offsetAttribute.startOffset(),
                    offsetAttribute.endOffset()));
        }
    } catch (IOException e) {
        throw new Exception(content + " extract words from phrase failed!", e);
    } finally {
        if (stream != null) {
            stream.close();
        }
    }
    return words;
}
From source file:com.xiaomi.linden.lucene.query.flexiblequery.FlexibleQuery.java
License:Apache License
private List<SegToken> parseToTokens(String content, float boost) throws IOException {
    List<SegToken> tokens = new ArrayList<>();
    TokenStream stream = analyzer.tokenStream("", new StringReader(content));
    try {
        CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(new SegToken(term.toString(), boost));
        }
    } finally {
        if (stream != null) {
            stream.close();
        }
    }
    return tokens;
}
From source file:com.zimbra.cs.index.query.ContactQuery.java
License:Open Source License
public ContactQuery(String text) {
    TokenStream stream = new ContactTokenFilter(
            new AddrCharTokenizer(new HalfwidthKanaVoicedMappingFilter(new StringReader(text))));
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    try {
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(CharMatcher.is('*').trimTrailingFrom(termAttr)); // remove trailing wildcard characters
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        // should never happen
        ZimbraLog.search.error("Failed to tokenize text=%s", text);
    }
}
From source file:com.zimbra.cs.index.query.TextQuery.java
License:Open Source License
TextQuery(TokenStream stream, String field, String text) {
    this.field = field;
    this.text = text;
    try {
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(termAttr.toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        // should never happen
        ZimbraLog.search.error("Failed to tokenize text=%s", text);
    }
}
From source file:com.zimbra.cs.index.ZimbraAnalyzer.java
License:Open Source License
public static String getAllTokensConcatenated(String fieldName, Reader reader) {
    StringBuilder toReturn = new StringBuilder();
    TokenStream stream = SINGLETON.tokenStream(fieldName, reader);
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    try {
        stream.reset();
        while (stream.incrementToken()) {
            toReturn.append(term);
            toReturn.append(' ');
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        e.printStackTrace(); // otherwise eat it
    }
    return toReturn.toString();
}