Example usage for org.apache.lucene.analysis.TokenStream.close()

Introduction

This page collects example usages of org.apache.lucene.analysis.TokenStream.close() from open-source projects.

Prototype

@Override
public void close() throws IOException;

Document

Releases resources associated with this stream.
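
Before the project examples below, here is a minimal sketch of the typical consume-and-close lifecycle: reset(), incrementToken() in a loop, end(), then close(). It assumes a recent Lucene (4.10 or later), where Analyzer.tokenStream accepts a String and StandardAnalyzer has a no-argument constructor; the field name "body" and the sample text are illustrative:

    import java.io.IOException;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class TokenStreamCloseSketch {
        public static void main(String[] args) throws IOException {
            Analyzer analyzer = new StandardAnalyzer();
            TokenStream ts = analyzer.tokenStream("body", "some sample text");
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            try {
                ts.reset();  // mandatory before the first incrementToken()
                while (ts.incrementToken()) {
                    System.out.println(termAtt.toString());
                }
                ts.end();    // record end-of-stream state (e.g. final offset)
            } finally {
                ts.close();  // release the resources associated with the stream
            }
            analyzer.close();
        }
    }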

Usage

From source file:lucli.LuceneMethods.java

License:Apache License

private void invertDocument(Document doc) throws IOException {

    Map tokenMap = new HashMap();
    final int maxFieldLength = 10000;

    Analyzer analyzer = createAnalyzer();
    Iterator fields = doc.getFields().iterator();
    final Token reusableToken = new Token();
    while (fields.hasNext()) {
        Field field = (Field) fields.next();
        String fieldName = field.name();

        if (field.isIndexed()) {
            if (field.isTokenized()) { // tokenized field
                Reader reader; // find or make Reader
                if (field.readerValue() != null)
                    reader = field.readerValue();
                else if (field.stringValue() != null)
                    reader = new StringReader(field.stringValue());
                else
                    throw new IllegalArgumentException("field must have either String or Reader value");

                int position = 0;
                // Tokenize field and add to postingTable
                TokenStream stream = analyzer.tokenStream(fieldName, reader);
                TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
                PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream
                        .addAttribute(PositionIncrementAttribute.class);

                try {
                    stream.reset(); // required before the first incrementToken() call
                    while (stream.incrementToken()) {
                        position += (posIncrAtt.getPositionIncrement() - 1);
                        position++;
                        String name = termAtt.term();
                        Integer count = (Integer) tokenMap.get(name);
                        if (count == null) { // not in there yet
                            tokenMap.put(name, Integer.valueOf(1)); // first one
                        } else {
                            tokenMap.put(name, Integer.valueOf(count.intValue() + 1));
                        }
                        if (position > maxFieldLength)
                            break;
                    }
                } finally {
                    stream.close();
                }
            }

        }
    }
    Entry[] sortedHash = getSortedMapEntries(tokenMap);
    for (int ii = 0; ii < sortedHash.length && ii < 10; ii++) {
        Entry currentEntry = sortedHash[ii];
        message((ii + 1) + ":" + currentEntry.getKey() + " " + currentEntry.getValue());
    }
}

From source file:lux.search.highlight.XmlHighlighter.java

License:Mozilla Public License

/**
 * Inspired by org.apache.lucene.search.highlight.Highlighter.
 *
 * Sends highlighted events to the writer.
 * @throws XMLStreamException
 */
private void highlightTextNode() throws IOException, XMLStreamException {
    TokenStream tokenStream = analyzer.tokenStream(textFieldName, textReader);
    xmlStreamTokens.reset(tokenStream);
    lastEndOffset = 0;
    for (boolean next = xmlStreamTokens.incrementToken(); next
            && (offsetAtt.startOffset() < maxDocCharsToAnalyze); next = xmlStreamTokens.incrementToken()) {
        if (scorerTokens != null && xmlStreamTokens.isPlainToken()) {
            scorerTokens.incrementToken();
        }
        if (tokenGroup.isDistinct()) {
            // write out any accumulated tokens
            handleTokenGroup();
            tokenGroup.clear();
        }
        if (scorerTokens == null || xmlStreamTokens.isPlainToken()) {
            tokenGroup.addToken(scorer.getTokenScore());
        }
    }
    handleTokenGroup();
    tokenGroup.clear();
    writeTrailingText();
    tokenStream.end();
    tokenStream.close();
}
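
Note the order of the last two calls: end() records end-of-stream state (such as the final offset) before close() releases the resources associated with the stream.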

From source file:net.mad.ads.server.utils.http.KeywordUtils.java

License:Open Source License

public static List<String> getTokens(String queryString) {
    try {
        GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_33);

        TokenStream ts = a.tokenStream("", new StringReader(queryString));

        List<String> tokens = new ArrayList<String>();

        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String token = termAtt.toString();
            tokens.add(token);
        }
        ts.end();
        ts.close();

        return tokens;
    } catch (IOException e) {
        logger.error("", e);
    }
    return null;
}
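
Since TokenStream is java.io.Closeable in recent Lucene versions, the explicit close() in the example above can also be expressed with try-with-resources. A minimal sketch of the same loop, assuming a caller-supplied Analyzer and the same imports as the example above:

    public static List<String> getTokens(Analyzer analyzer, String queryString) throws IOException {
        List<String> tokens = new ArrayList<String>();
        // try-with-resources closes the stream even if incrementToken() throws
        try (TokenStream ts = analyzer.tokenStream("", new StringReader(queryString))) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                tokens.add(termAtt.toString());
            }
            ts.end();
        }
        return tokens;
    }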

From source file:net.sf.logsaw.index.internal.LuceneIndexServiceImpl.java

License:Open Source License

private void fillPhraseQuery(PhraseQuery phrase, Analyzer analyzer, String fld, String val) throws IOException {
    TokenStream ts = analyzer.tokenStream(fld, new StringReader(val));
    try {
        ts.reset();
        // Iterate over tokens and treat each token as term
        int pos = 0;
        while (ts.incrementToken()) {
            CharTermAttribute t = ts.getAttribute(CharTermAttribute.class);
            PositionIncrementAttribute p = ts.getAttribute(PositionIncrementAttribute.class);
            pos += p.getPositionIncrement();
            phrase.add(new Term(fld, t.toString()), pos - 1);
        }
        // End-of-stream clean-up
        ts.end();
    } finally {
        ts.close();
    }
}
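
This example follows the consumer workflow described in the TokenStream javadoc: reset(), incrementToken() in a loop, end() inside the try block, and close() in a finally block so the stream is released even when tokenization fails.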

From source file:net.sf.okapi.lib.tmdb.lucene.Seeker.java

License:Open Source License

public List<TmHit> searchFuzzy(String genericText, String codesAsString, String tmId, String locale, int max,
        int threshold, HashMap<String, String> attributes) {
    float searchThreshold = (float) threshold;
    if (threshold < 0)
        searchThreshold = 0.0f;
    if (threshold > 100)
        searchThreshold = 100.0f;

    String queryText = genericText;

    String gtextFName = TmEntry.GTEXT_PREFIX + locale;
    Locale javaLoc = new Locale(locale);

    // create basic ngram analyzer to tokenize query
    TokenStream queryTokenStream;
    if (javaLoc.getLanguage().equals(Locale.ENGLISH.getLanguage())) { // compare language codes with equals(), not ==
        queryTokenStream = defaultFuzzyAnalyzer.tokenStream(gtextFName, new StringReader(queryText));
    } else {
        queryTokenStream = new NgramAnalyzer(javaLoc, 4).tokenStream(gtextFName, new StringReader(queryText));
    }

    // Get the TermAttribute from the TokenStream
    CharTermAttribute termAtt = (CharTermAttribute) queryTokenStream.addAttribute(CharTermAttribute.class);
    TmFuzzyQuery fQuery = new TmFuzzyQuery(searchThreshold, gtextFName);

    try {
        queryTokenStream.reset();
        while (queryTokenStream.incrementToken()) {
            //Term t = new Term(keyIndexField, new String(termAtt.buffer()));
            Term t = new Term(gtextFName, termAtt.toString());
            fQuery.add(t);
        }
        queryTokenStream.end();
        queryTokenStream.close();
    } catch (IOException e) {
        throw new OkapiIOException(e.getMessage(), e);
    }

    return getFuzzyHits(fQuery, genericText, codesAsString, tmId, locale, max, searchThreshold, attributes);
}

From source file:net.simpleframework.ado.lucene.AbstractLuceneManager.java

License:Apache License

@Override
public String[] getQueryTokens(final String queryString) {
    TokenStream tokenStream = null;
    try {
        tokenStream = getDefaultAnalyzer().tokenStream("QUERY_TOKENS", new StringReader(queryString));
        tokenStream.reset();
        final ArrayList<String> al = new ArrayList<>();
        while (tokenStream.incrementToken()) {
            final String term = tokenStream.getAttribute(CharTermAttribute.class).toString();
            if (term != null && term.length() > 1) {
                al.add(term);
            }
        }
        if (al.size() == 0) {
            al.add(queryString);
        }

        return al.toArray(new String[al.size()]);
    } catch (final IOException e) {
        throw ADOException.of(e);
    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.close();
            } catch (final IOException e) {
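                // ignored: nothing more can be done if close() fails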
            }
        }
    }
}

From source file:net.skyatlas.icd.dao.daoImpl.AnsjAnalysisTest.java

@Test
public void test() throws IOException {
    Token nt = new Token();
    Analyzer ca = new AnsjAnalysis();
    Reader sentence = new StringReader(
            "\n\n\n\n\n\n\n????, ????????????????????????????"
                    + "???????????????????"
                    + "??????????? ??????????????2????"
                    + ""
                    + "? ?????????????  ??? ????????");
    TokenStream ts = ca.tokenStream("sentence", sentence);

    System.out.println("start: " + (new Date()));
    long before = System.currentTimeMillis();
    while (ts.incrementToken()) {
        System.out.println(ts.getAttribute(CharTermAttribute.class));
    }
    ts.close();
    long now = System.currentTimeMillis();
    System.out.println("time: " + (now - before) / 1000.0 + " s");
}

From source file:net.skyatlas.icd.test.AnsegTest.java

static public void main(String[] args)
        throws IOException, CorruptIndexException, ParseException, InvalidTokenOffsetsException {
    AnsegTest inst = new AnsegTest();
    Token nt = new Token();
    Analyzer ca = new AnsjAnalysis();
    Reader sentence = new StringReader(
            "\n\n\n\n\n\n\n????, ????????????????????????????"
                    + "???????????????????"
                    + "??????????? ??????????????2????"
                    + ""
                    + "? ?????????????  ??? ????????");
    TokenStream ts = ca.tokenStream("sentence", sentence);

    System.out.println("start: " + (new Date()));
    long before = System.currentTimeMillis();
    while (ts.incrementToken()) {
        System.out.println(ts.getAttribute(CharTermAttribute.class));
    }
    ts.close();
    long now = System.currentTimeMillis();
    System.out.println("time: " + (now - before) / 1000.0 + " s");

    HashSet<String> hs = new HashSet<String>();
    BufferedReader reader2 = IOUtil.getReader(ResourceBundle.getBundle("library").getString("stopLibrary"),
            "UTF-8");
    String word = null;
    while ((word = reader2.readLine()) != null) {
        hs.add(word);
    }
    Analyzer analyzer = new AnsjAnalysis(hs, false);
    Directory directory = null;
    IndexWriter iwriter = null;

    BufferedReader reader = IOUtil.getReader("/Users/changzhenghe/Downloads/hy_statspack01.txt", "UTF-8");
    String temp = null;
    StringBuilder sb = new StringBuilder();
    while ((temp = reader.readLine()) != null) {
        sb.append(temp);
        sb.append("\n");
    }
    reader.close();
    String text = sb.toString();

    text = "????????????  ??? ????????";

    IndexWriterConfig ic = new IndexWriterConfig(Version.LUCENE_32, analyzer);
    // 
    directory = new RAMDirectory();
    iwriter = new IndexWriter(directory, ic);
    // BufferedReader reader =
    // IOUtil.getReader("/Users/ansj/Documents//?//1998?_.txt",
    // "GBK");
    // String temp = null;
    // while ((temp = reader.readLine()) != null) {
    // addContent(iwriter, temp);
    // }
    inst.addContent(iwriter, "?   ?()   (?)");
    inst.addContent(iwriter, "   ?()   (?)");
    inst.addContent(iwriter, "?   ?   (?)");
    inst.addContent(iwriter, "   ??NEC   ");
    inst.addContent(iwriter, "?");
    iwriter.commit();
    iwriter.close();

    System.out.println("");

    inst.search(analyzer, directory, "");
    inst.search(analyzer, directory, "");
    inst.search(analyzer, directory, "");
    inst.search(analyzer, directory, "?");

    /*
     KeyWordComputer kwc = new KeyWordComputer(5);
     String title = "??";
     String content = "9??"
     + "?????????"
     + "????"
     + "??"
     + "?????"
     + "???"
     + "??????"
     + "???"
     + "????20??"
     + "????"
     + "?"
     + "???]??"
     + "???";
     Collection<Keyword> result = kwc.computeArticleTfidf(title, content);
     System.out.println(result);
            
     AnsegTest t = new AnsegTest();
     List<Term> parse = ToAnalysis.parse("?");
     System.out.println(parse);
     System.out.println("*********** ? ************");
     //        UserDefineLibrary.insertWord("", "userDefine", 1000);
     //        UserDefineLibrary.insertWord("?", "userDefine", 1000);
     UserDefineLibrary.insertWord("?", "userDefine", 1000);
     parse = ToAnalysis.parse("???");
     System.out.println(parse);
     */
}

From source file:net.strong.weblucene.search.WebLuceneHighlighter.java

License:Apache License

/**
 * Return highlighted string
 *
 * @param srcString source string need to highlight
 *
 * @return highlighted string
 */
public String highlight(String srcString) {
    if ((srcString == null) || srcString.trim().equals("")) {
        return "";
    }

    int srcLength = srcString.length();

    //truncate src to maxBufferSize
    if (srcLength >= maxBufferSize) {
        srcString = srcString.substring(0, maxBufferSize);
        srcLength = maxBufferSize;
    }

    //return src if no term to highlight
    if (terms.size() == 0) {
        return (srcLength > maxReturnSize) ? srcString.substring(0, maxReturnSize) : srcString;
    }

    try {
        //reset buffer and last term offset
        //default previous token end place
        int prevEnd = 0;
        srcBuffer = new char[srcLength];

        StringReader stringReader = new StringReader(srcString);
        stringReader.read(srcBuffer);

        StringReader sr = new StringReader(srcString);
        TokenStream tokenStream = analyzer.tokenStream(null, sr);

        //return string buffer  
        StringBuffer returnBuffer = new StringBuffer();
        String preContextBlock = ""; //previous text block

        //highlight:  [preContextBlock] + <b> + [token] + </b>
        //(uses the pre-Lucene-2.9, Token-based TokenStream API)
        for (Token t = tokenStream.next(); t != null; t = tokenStream.next()) {
            preContextBlock = getContext(prevEnd, t.startOffset());
            returnBuffer.append(preContextBlock);

            //append highlight string
            returnBuffer.append(highlightPrefix);

            for (int i = t.startOffset(); i < t.endOffset(); i++) {
                returnBuffer.append(srcBuffer[i]);
            }

            returnBuffer.append(highlightSuffix);

            //record current offset
            prevEnd = t.endOffset();

            if (returnBuffer.length() > maxReturnSize) {
                break;
            }
        }

        tokenStream.close();

        //no highlighted token found; return the first maxReturnSize characters
        if (returnBuffer.length() == 0) {
            if (srcLength > maxReturnSize) {
                returnBuffer.append(srcBuffer, 0, maxReturnSize);
            } else {
                returnBuffer.append(srcBuffer, 0, srcLength);
            }

            return returnBuffer.toString();
        }

        //expand return string to MaxReturn
        while ((returnBuffer.length() < maxReturnSize) && (prevEnd < srcLength)) {
            returnBuffer.append(srcBuffer[prevEnd]);
            prevEnd++;
        }

        return returnBuffer.toString();
    } catch (Exception e) {
        e.printStackTrace();

        //return with original value
        return "";
    }
}

From source file:NewsIR_search.TRECQuery.java

/**
 * Returns the content of the 'queryField' from the query text
 * @param analyzer
 * @param queryField
 * @return (String) The content of the field
 * @throws Exception 
 */
public String queryFieldAnalyze(Analyzer analyzer, String queryField) throws Exception {
    StringBuffer buff = new StringBuffer();
    TokenStream stream = analyzer.tokenStream(CumulativeIndexer.FIELD_TEXT, new StringReader(queryField));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        term = term.toLowerCase();
        buff.append(term).append(" ");
    }
    stream.end();
    stream.close();
    return buff.toString();
}