List of usage examples for org.apache.lucene.analysis TokenStream close
@Override public void close() throws IOException
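All of the examples below follow the same basic contract for consuming a TokenStream before closing it. As a point of reference, here is a minimal sketch of that contract, assuming the attribute-based API of Lucene 4.x and later; the field name "body", the class name, and the analyzer argument are placeholders, not taken from any example on this page:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public final class TokenStreamCloseSketch {

    // Prints every token the analyzer produces for the given text.
    // reset / incrementToken / end / close is the sequence every
    // consumer of a TokenStream is expected to follow.
    static void consume(Analyzer analyzer, String text) throws IOException {
        TokenStream stream = analyzer.tokenStream("body", new StringReader(text)); // "body" is a placeholder field name
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        try {
            stream.reset();                       // mandatory before the first incrementToken()
            while (stream.incrementToken()) {
                System.out.println(term.toString());
            }
            stream.end();                         // records the final offset state
        } finally {
            stream.close();                       // releases the underlying Reader and other resources
        }
    }
}

Skipping reset() or end() is tolerated by some older analyzers, but on recent Lucene versions it typically fails with a "TokenStream contract violation" IllegalStateException, which is why several of the snippets below guard close() with a try/finally block.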
From source file:com.bizosys.hsearch.inpipe.ComputeTokens.java
License:Apache License
private void tokenize(Doc doc, TermStream ts) throws SystemFault, ApplicationFault, IOException {
    if (null == ts)
        return;
    TokenStream stream = ts.stream;
    if (null == stream)
        return;

    DocTerms terms = doc.terms;
    if (null == doc.terms) {
        terms = new DocTerms();
        doc.terms = terms;
    }

    String token = null;
    int offset = 0;
    CharTermAttribute termA = (CharTermAttribute) stream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetA = (OffsetAttribute) stream.getAttribute(OffsetAttribute.class);

    stream.reset();
    while (stream.incrementToken()) {
        token = termA.toString();
        offset = offsetA.startOffset();
        Term term = new Term(doc.tenant, token, ts.sighting, ts.type, offset);
        terms.getTermList().add(term);
    }
    stream.close();
}
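Because TokenStream implements Closeable, the explicit stream.close() above can also be handled by try-with-resources. A minimal sketch reusing the names from this example (Doc, TermStream, DocTerms and Term are this project's own classes, assumed unchanged), with the end() call that the TokenStream contract expects added before the stream is closed:

// Sketch only: same attributes as the example above, but close() is handled
// by try-with-resources and end() is called once iteration finishes.
try (TokenStream stream = ts.stream) {            // assumes ts.stream is non-null at this point
    CharTermAttribute termA = stream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetA = stream.getAttribute(OffsetAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        terms.getTermList().add(
                new Term(doc.tenant, termA.toString(), ts.sighting, ts.type, offsetA.startOffset()));
    }
    stream.end();
}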
From source file:com.bizosys.hsearch.kv.indexing.KVMapperBase.java
License:Apache License
@SuppressWarnings({ "rawtypes", "unchecked" }) public final void mapFreeTextBitset(final Field fld, final Context context) throws IOException, InterruptedException { terms.clear();//from w ww.j a v a 2 s. c om CharTermAttribute termAttribute = null; TokenStream stream = null; try { if (isFieldNull) return; Analyzer analyzer = AnalyzerFactory.getInstance().getAnalyzer(fld.name); stream = analyzer.tokenStream(fld.name, new StringReader(fldValue)); termAttribute = stream.getAttribute(CharTermAttribute.class); String last2 = null; String last1 = null; while (stream.incrementToken()) { String termWord = termAttribute.toString(); if (0 == termWord.length()) continue; appender.delete(0, appender.capacity()); /** * Row Key is mergeidFIELDwordhashStr */ boolean isEmpty = (null == mergeId) ? true : (mergeId.length() == 0); String rowKeyPrefix = (isEmpty) ? fld.name : mergeId + "_" + fld.name; rowKeyP1 = rowKeyPrefix + termWord; rowKeyP2 = appender.append("text").append(KVIndexer.FIELD_SEPARATOR).append(fld.sourceSeq) .toString(); appender.setLength(0); rowVal = appender.append(incrementalIdSeekPosition).toString(); context.write(new TextPair(rowKeyP1, rowKeyP2), new Text(rowVal)); if (!fld.isBiWord && !fld.isTriWord) continue; /** * Do Three phrase word */ if (null != last2) { appender.setLength(0); rowKeyP1 = appender.append(rowKeyPrefix).append(last2).append(' ').append(last1).append(' ') .append(termWord).append(' ').append('*').toString(); appender.setLength(0); rowKeyP2 = appender.append("text").append(KVIndexer.FIELD_SEPARATOR).append(fld.sourceSeq) .toString(); context.write(new TextPair(rowKeyP1, rowKeyP2), new Text(rowVal)); } /** * Do Two phrase word */ if (null != last1) { appender.setLength(0); rowKeyP1 = appender.append(rowKeyPrefix).append(last1).append(' ').append(termWord).append(' ') .append('*').toString(); appender.setLength(0); rowKeyP2 = appender.append("text").append(KVIndexer.FIELD_SEPARATOR).append(fld.sourceSeq) .toString(); context.write(new TextPair(rowKeyP1, rowKeyP2), new Text(rowVal)); } last2 = last1; last1 = termWord; } } catch (Exception e) { e.printStackTrace(); System.err.println("Error While tokenizing : " + e.getMessage()); } finally { try { if (null != stream) stream.close(); } catch (Exception ex) { IdSearchLog.l.warn("Error during Tokenizer Stream closure"); } } }
From source file:com.bizosys.unstructured.IndexWriter.java
License:Apache License
/**
 * Find the last offset.
 * Find each term offset
 *
 * @param stream
 * @param docId
 * @param docType
 * @param fieldType
 * @param fieldBoost
 * @param codecs
 * @param uniqueTokens
 * @throws IOException
 */
private final void tokenize(TokenStream stream, int docId, int docType, DocumentMetadata filter, int fieldType,
        Map<String, IndexRow> uniqueTokens) throws IOException {

    String token = null;
    int curoffset = 0;
    int lastoffset = 0;
    int position = -1;

    StringBuilder sb = new StringBuilder();
    CharTermAttribute termA = (CharTermAttribute) stream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetA = (OffsetAttribute) stream.getAttribute(OffsetAttribute.class);

    while (stream.incrementToken()) {
        token = termA.toString();
        curoffset = offsetA.endOffset();

        if (lastoffset != curoffset)
            position++;
        lastoffset = curoffset;

        String key = IndexRow.generateKey(sb, docId, token, docType, fieldType, filter);
        sb.delete(0, sb.capacity());

        if (uniqueTokens.containsKey(key)) {
            IndexRow existingRow = uniqueTokens.get(key);
            existingRow.set(curoffset, position);
            existingRow.occurance++;
        } else {
            IndexRow row = new IndexRow(docId, token, docType, fieldType, curoffset, position);
            if (null != filter)
                row.docMeta = filter;
            uniqueTokens.put(key, row);
        }
    }
    stream.end();
    stream.close();

    for (IndexRow row : uniqueTokens.values())
        cachedIndex.add(row);
}
From source file:com.chriscx.stem.Stem.java
public String evaluate(BufferedReader input) {
    if (input == null) {
        return null;
    }
    CharArraySet stopWordsSet = new CharArraySet(Version.LUCENE_46, 10000, true);
    String stopWords = "a afin ai ainsi après attendu au aujourd auquel aussi "
            + "autre autres aux auxquelles auxquels avait avant avec c car ce "
            + "ceci cela celle celles celui cependant certain certaine certaines "
            + "certains ces cet cette ceux chez ci combien comme comment "
            + "concernant contre d dans de debout dedans dehors delà depuis "
            + "derrière des désormais desquelles desquels devers devra doit "
            + "donc dont du duquel durant dès elle elles en entre environ est"
            + " et etc eu eux excepté hormis hors hélas hui il ils j je jusqu "
            + "jusque l la laquelle le lequel les lesquelles lesquels leur leurs "
            + "lorsque lui là ma mais malgré me merci mes mien mienne miennes "
            + "miens moins mon moyennant même mêmes n ne ni non nos notre nous "
            + "néanmoins nôtre nôtres on ont ou outre où par parmi partant pas "
            + "passé pendant plein plus plusieurs pour pourquoi proche près "
            + "puisque qu quand que quel quelle quelles quels qui quoi quoique"
            + " revoici revoilà s sa sans sauf se selon seront ses si sien "
            + "sienne siennes siens sinon soi soit son sont sous suivant sur "
            + "ta te tes tien tienne tiennes tiens ton tous tout toute toutes"
            + " tu un une va vers voici voilà vos votre vous vu vôtre vôtres y "
            + " été être ";
    String[] stopWordsTab = stopWords.split(" ");
    for (String word : stopWordsTab) {
        stopWordsSet.add(word);
    }
    Analyzer analyzer = new FrenchAnalyzer(Version.LUCENE_46, stopWordsSet);
    result = "";
    try {
        String line = input.readLine();
        line = line.replaceAll("(\\S)+@(\\S)+.(\\S)+", "");
        line = line.replaceAll("(0[0-68]([-. ]?\\d{2}){4}[-. ]?)|\\d+", "");
        line = line.replaceAll("(_|-)+", "");
        line = line.replaceAll("(\\n|\\r|\\t)+", "");
        line = line.replaceAll("(?![\\._])\\p{P}", "");

        while (line != null) {
            TokenStream stream = analyzer.tokenStream(null, line);
            stream.reset();
            while (stream.incrementToken()) {
                String wordset = stream.getAttribute(CharTermAttribute.class).toString();
                wordset = wordset.replaceAll("(0[0-68]([-. ]?\\d{2}){4}[-. ]?)|\\d+", "");
                result += wordset + " ";
            }
            result += "\n";
            stream.close();
            line = input.readLine();
        }
        input.close();
        return result;
    } catch (IOException e) {
        // not thrown b/c we're using a string reader...
        throw new RuntimeException(e);
    }
}
From source file:com.clustertest2.clustertest2.vectorization.DocTokenizer.java
public void performWork(Path doc) throws IOException {
    try {
        System.out.println("performing token work");
        HashMap<Text, StringTuple> tokenized = new HashMap<>();
        StringBuilder part = new StringBuilder();

        // store the tokens of each doc
        for (Pair<Writable, Writable> pair : new SequenceFileDirIterable<>(doc, PathType.GLOB,
                ClusterFileService.CONF)) {
            String key = pair.getFirst().toString();
            System.out.println(key);
            String value = pair.getSecond().toString();
            part.append(key);
            TokenStream stream = analyzer.tokenStream(key, new StringReader(value));
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            StringTuple document = new StringTuple();
            while (stream.incrementToken()) {
                if (termAtt.length() > 0) {
                    document.add(new String(termAtt.buffer(), 0, termAtt.length()));
                }
            }
            stream.end();
            stream.close();
            tokenized.put(new Text(key), document);
        }

        // write the sequencefile
        Path tokenizedSeq = new Path(vectorsDir, part.toString());
        try (SequenceFile.Writer writer = new SequenceFile.Writer(ClusterFileService.FS,
                ClusterFileService.CONF, tokenizedSeq, Text.class, StringTuple.class)) {
            for (Text k : tokenized.keySet()) {
                writer.append(k, tokenized.get(k));
            }
            writer.close();
            System.out.println("wrote");
        }
    } catch (Exception e) {
        System.out.println(e.getMessage());
    } finally {
        numThreads.decrementAndGet();
    }
}
From source file:com.flaptor.hounder.classifier.util.DocumentParser.java
License:Apache License
/**
 * Transforms the document into an array of tokens and counts the number of
 * occurrences of each token.
 * @param doc the document represented as a string
 * @param maxTuple If maxTuple>1 then tuples of 1..maxTuples will be returned.
 * Ie if the document is "t1 t2 t3 t4" and maxTuple=2, then the returned
 * map will contain values for each of the following: t1, t2, t1_t2, t2_t3
 * If maxTuple <1 then maxTuple=1.
 * @return a map that binds each token with the count of occurrences within
 * the document
 * @see {@link TupleTokenizer}{@link #parse(String, int)}
 * The map should be '<String,int>'. But int can't be inserted into a Map, and
 * Integer is unmodifiable. So this awful hack uses an int[] to be able to
 * add an int and change its value easily during the calculation.
 */
public static Map<String, int[]> parse(String doc, int maxTuple) {
    // TODO: Use Integer instead of int[].
    Map<String, int[]> tokenCount = new HashMap<String, int[]>();
    // TODO: Decouple from lucene, allow the analyzer to be configurable.
    // TODO: Verify that it is necessary to create a new analyzer instance each time.
    Analyzer analyzer = new StandardAnalyzer();
    Reader docReader = new StringReader(doc);
    TokenStream tokenStream = analyzer.tokenStream(null, docReader);
    try {
        if (1 < maxTuple) {
            tokenStream = new TupleTokenizer(tokenStream, maxTuple);
        }
        Token token = new Token();
        while ((token = tokenStream.next(token)) != null) {
            String term = TokenUtil.termText(token);
            int[] count = tokenCount.get(term);
            if (count == null) {
                count = new int[] { 0 };
                tokenCount.put(term, count);
            } else {
                count[0]++;
            }
        }
    } catch (IOException e) {
        System.err.println("parse: couldn't parse document " + e);
    } finally {
        try {
            tokenStream.close();
        } catch (IOException e) {
            System.err.println("close: " + e);
        }
    }
    return tokenCount;
}
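This example still uses the pre-Lucene-2.9 Token/next(Token) iteration, which later Lucene versions removed. On the attribute-based API used by the other examples on this page, a counting loop of this kind would look roughly like the sketch below; this is a hypothetical standalone method, not the project's code, and the TupleTokenizer wrapping is omitted because its newer-API form depends on that class:

import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

final class AttributeBasedCounting {

    // Counts token occurrences with the attribute-based API that replaced next(Token).
    // Single-token counting only; tuple expansion from the original is left out.
    static Map<String, Integer> countTerms(Analyzer analyzer, String doc) throws IOException {
        Map<String, Integer> tokenCount = new HashMap<>();
        TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(doc));
        CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
        try {
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                tokenCount.merge(termAtt.toString(), 1, Integer::sum);
            }
            tokenStream.end();
        } finally {
            tokenStream.close();
        }
        return tokenCount;
    }
}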
From source file:com.flaptor.indextank.query.IndexEngineParser.java
License:Apache License
public Iterator<AToken> parseDocumentField(String fieldName, String content) {
    final TokenStream tkstream = analyzer.tokenStream(fieldName, new StringReader(content));
    final TermAttribute termAtt = tkstream.addAttribute(TermAttribute.class);
    final PositionIncrementAttribute posIncrAttribute = tkstream.addAttribute(PositionIncrementAttribute.class);
    final OffsetAttribute offsetAtt = tkstream.addAttribute(OffsetAttribute.class);

    return new AbstractIterator<AToken>() {
        int currentPosition = 0;

        @Override
        protected AToken computeNext() {
            try {
                if (!tkstream.incrementToken()) {
                    tkstream.end();
                    tkstream.close();
                    return endOfData();
                }
            } catch (IOException e) {
                // This should never happen, as the reader is a StringReader
            }
            //final org.apache.lucene.analysis.Token luceneTk = tkstream.getAttribute(org.apache.lucene.analysis.Token.class);
            currentPosition += posIncrAttribute.getPositionIncrement();
            final int position = currentPosition;
            final int startOffset = offsetAtt.startOffset();
            final int endOffset = offsetAtt.endOffset();
            final String text = termAtt.term();
            return new AToken() {
                @Override
                public String getText() {
                    return text; //luceneTk.term();
                }

                @Override
                public int getPosition() {
                    return position; //luceneTk.getPositionIncrement();
                }

                @Override
                public int getStartOffset() {
                    return startOffset;
                }

                @Override
                public int getEndOffset() {
                    return endOffset;
                }
            };
        }
    };
}
From source file:com.fujitsu.ca.fic.caissepop.evaluation.TokenizeText.java
License:Apache License
@Override
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() < 1 || input.isNull(0)) {
        return null;
    }

    DataBag bagOfTokens = bagFactory.newDefaultBag();
    TokenStream tokenStream = null;
    try {
        String lineOfText = input.get(0).toString();
        StringReader textInput = new StringReader(lineOfText);
        tokenStream = analyzer.tokenStream(noField, textInput);
        CharTermAttribute termAttribute = tokenStream.getAttribute(CharTermAttribute.class);

        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            Tuple termText = tupleFactory.newTuple(termAttribute.toString());
            bagOfTokens.add(termText);
            termAttribute.setEmpty();
        }
    } finally {
        if (tokenStream != null) {
            tokenStream.close();
        }
    }
    return bagOfTokens;
}
From source file:com.github.healthonnet.search.SynonymExpandingExtendedDismaxQParserPlugin.java
License:Apache License
private String analyzeQuery(String query, Analyzer analyzer) {

    if (analyzer != null && query != null && query.length() > 0) {
        TokenStream tokenStream = analyzer.tokenStream(Const.IMPOSSIBLE_FIELD_NAME, new StringReader(query));

        StringBuilder newQueryB = new StringBuilder();
        try {
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class);
                // OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
                // TypeAttribute typeAttribute = tokenStream.getAttribute(TypeAttribute.class);

                newQueryB.append(term.toString());
                newQueryB.append(' ');
            }
            tokenStream.end();
            return newQueryB.toString().trim();
        } catch (IOException e) {
            throw new RuntimeException("uncaught exception in synonym processing", e);
        } finally {
            try {
                tokenStream.close();
            } catch (IOException e) {
                throw new RuntimeException("uncaught exception in synonym processing", e);
            }
        }
    }
    return query;
}
From source file:com.github.healthonnet.search.SynonymExpandingExtendedDismaxQParserPlugin.java
License:Apache License
/**
 * Given the synonymAnalyzer, returns a list of all alternate queries expanded from the original user query.
 *
 * @param synonymAnalyzer
 * @param solrParams
 * @return
 */
private List<Query> generateSynonymQueries(Analyzer synonymAnalyzer, SolrParams solrParams) {

    String origQuery = getQueryStringFromParser();
    int queryLen = origQuery.length();

    // TODO: make the token stream reusable?
    TokenStream tokenStream = synonymAnalyzer.tokenStream(Const.IMPOSSIBLE_FIELD_NAME,
            new StringReader(origQuery));

    SortedSetMultimap<Integer, TextInQuery> startPosToTextsInQuery = TreeMultimap.create();

    boolean constructPhraseQueries = solrParams.getBool(Params.SYNONYMS_CONSTRUCT_PHRASES, false);
    boolean bag = solrParams.getBool(Params.SYNONYMS_BAG, false);
    List<String> synonymBag = new ArrayList<>();

    try {
        tokenStream.reset();

        while (tokenStream.incrementToken()) {
            CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAttribute = tokenStream.getAttribute(TypeAttribute.class);

            if (!typeAttribute.type().equals("shingle")) {
                // ignore shingles; we only care about synonyms and the original text
                // TODO: filter other types as well

                String termToAdd = term.toString();

                if (typeAttribute.type().equals("SYNONYM")) {
                    synonymBag.add(termToAdd);
                }

                // Don't quote single-term synonyms
                if (constructPhraseQueries && typeAttribute.type().equals("SYNONYM")
                        && termToAdd.contains(" ")) {

                    // Don't quote when the original is already surrounded by quotes
                    if (offsetAttribute.startOffset() == 0 || offsetAttribute.endOffset() == queryLen
                            || origQuery.charAt(offsetAttribute.startOffset() - 1) != '"'
                            || origQuery.charAt(offsetAttribute.endOffset()) != '"') {
                        // make a phrase out of the synonym
                        termToAdd = new StringBuilder(termToAdd).insert(0, '"').append('"').toString();
                    }
                }
                if (!bag) {
                    // create a graph of all possible synonym combinations,
                    // e.g. dog bite, hound bite, dog nibble, hound nibble, etc.
                    TextInQuery textInQuery = new TextInQuery(termToAdd, offsetAttribute.startOffset(),
                            offsetAttribute.endOffset());
                    startPosToTextsInQuery.put(offsetAttribute.startOffset(), textInQuery);
                }
            }
        }
        tokenStream.end();
    } catch (IOException e) {
        throw new RuntimeException("uncaught exception in synonym processing", e);
    } finally {
        try {
            tokenStream.close();
        } catch (IOException e) {
            throw new RuntimeException("uncaught exception in synonym processing", e);
        }
    }

    List<String> alternateQueries = synonymBag;

    if (!bag) {
        // use a graph rather than a bag
        List<List<TextInQuery>> sortedTextsInQuery = new ArrayList<>(startPosToTextsInQuery.values().size());
        sortedTextsInQuery.addAll(startPosToTextsInQuery.asMap().values().stream().map(ArrayList::new)
                .collect(Collectors.toList()));

        // have to use the start positions and end positions to figure out all possible combinations
        alternateQueries = buildUpAlternateQueries(solrParams, sortedTextsInQuery);
    }

    // save for debugging purposes
    expandedSynonyms = alternateQueries;

    return createSynonymQueries(solrParams, alternateQueries);
}