Example usage for org.apache.lucene.analysis.tokenattributes FlagsAttribute getFlags

List of usage examples for org.apache.lucene.analysis.tokenattributes FlagsAttribute getFlags

Introduction

In this page you can find the example usage for org.apache.lucene.analysis.tokenattributes FlagsAttribute getFlags.

Prototype

public int getFlags();

Source Link

Document

Get the bitset for any bits that have been set.

Usage

From source file:com.jaeksoft.searchlib.analysis.TokenTerm.java

License:Open Source License

/**
 * Snapshots the current token state of a Lucene TokenStream into this
 * immutable term holder. Every attribute argument may be null; a missing
 * attribute is recorded as a neutral default (null text/type, 0 offsets,
 * 0 increment, 0 flags).
 *
 * @param termAtt    term text attribute, or null
 * @param posIncrAtt position-increment attribute, or null
 * @param offsetAtt  start/end character offset attribute, or null
 * @param typeAtt    token type attribute, or null
 * @param flagsAtt   flags bitset attribute, or null
 */
public TokenTerm(final CharTermAttribute termAtt, final PositionIncrementAttribute posIncrAtt,
        final OffsetAttribute offsetAtt, final TypeAttribute typeAtt, final FlagsAttribute flagsAtt) {
    this.term = termAtt == null ? null : termAtt.toString();
    if (offsetAtt == null) {
        this.start = 0;
        this.end = 0;
    } else {
        this.start = offsetAtt.startOffset();
        this.end = offsetAtt.endOffset();
    }
    this.increment = posIncrAtt == null ? 0 : posIncrAtt.getPositionIncrement();
    this.type = typeAtt == null ? null : typeAtt.type();
    this.flags = flagsAtt == null ? 0 : flagsAtt.getFlags();
}

From source file:com.qwazr.search.index.TermDefinition.java

License:Apache License

/**
 * Captures the state of the supplied token-stream attributes into this
 * definition. Any attribute may be null; the corresponding field is then
 * left null (meaning "absent") rather than given a default value.
 */
TermDefinition(CharTermAttribute charTermAttr, FlagsAttribute flagsAttr, OffsetAttribute offsetAttr,
        PositionIncrementAttribute posIncAttr, PositionLengthAttribute posLengthAttr, TypeAttribute typeAttr,
        KeywordAttribute keywordAttr) {
    char_term = charTermAttr != null ? charTermAttr.toString() : null;
    // Start and end offsets come from the same attribute, so they are
    // either both set or both null.
    if (offsetAttr == null) {
        start_offset = null;
        end_offset = null;
    } else {
        start_offset = offsetAttr.startOffset();
        end_offset = offsetAttr.endOffset();
    }
    flags = flagsAttr != null ? flagsAttr.getFlags() : null;
    position_increment = posIncAttr != null ? posIncAttr.getPositionIncrement() : null;
    position_length = posLengthAttr != null ? posLengthAttr.getPositionLength() : null;
    type = typeAttr != null ? typeAttr.type() : null;
    is_keyword = keywordAttr != null ? keywordAttr.isKeyword() : null;
}

From source file:dependencies.ReviewDependencyAnalyzer.java

License:Open Source License

/**
 * Runs the reader's text through this analyzer and groups the resulting
 * tokens into sentences.
 *
 * <p>A sentence is closed whenever a token reports {@code isDelim(true)}.
 * Closed sentences containing only the delimiter token itself (size 1) are
 * discarded. A trailing sentence that never sees a closing delimiter is
 * still added to the result.
 *
 * @param reader source of the text to analyze
 * @return list of sentences, each a list of tokens in stream order, or
 *         {@code null} if reading from the reader failed
 */
public ArrayList<ArrayList<Token>> getSentences(Reader reader) {

    try {
        // Send reader data through the analyzer
        // NOTE(review): getAttribute() throws if the chain did not register
        // these attributes — presumably reusableTokenStream() always adds
        // them; verify against the analyzer implementation.
        // NOTE(review): the stream is neither reset() nor close()d here;
        // that was tolerated by pre-4.0 reusable streams — confirm for the
        // Lucene version in use.
        TokenStream tokstr = reusableTokenStream("", reader);
        TermAttribute tok_term = tokstr.getAttribute(TermAttribute.class);
        TypeAttribute tok_type = tokstr.getAttribute(TypeAttribute.class);
        FlagsAttribute tok_flags = tokstr.getAttribute(FlagsAttribute.class);
        PayloadAttribute tok_payload = tokstr.getAttribute(PayloadAttribute.class);

        // Split the tokenstream returned by the analyzer into sentences. Convert each sentence
        // into a linked list of tokens
        ArrayList<ArrayList<Token>> sentence_list = new ArrayList<ArrayList<Token>>();
        ArrayList<Token> current_sentence = new ArrayList<Token>();

        while (tokstr.incrementToken()) {
            // Materialize the stream's current attribute state as a Token.
            Token current_token = new Token(tok_term.term(), tok_type.type(), tok_flags.getFlags(),
                    new ReviewTermPayload(tok_payload.getPayload()));
            current_sentence.add(current_token);

            // End of sentence reached. Add current sentence to the sentence list
            if (current_token.isDelim(true)) {
                // Size > 1 filters out "sentences" that are only a delimiter.
                if (current_sentence.size() > 1) {
                    sentence_list.add(current_sentence);
                }
                current_sentence = new ArrayList<Token>();
            }
        }

        // At the end of the token stream, if there is an incomplete sentence, add it to the
        // sentence list.
        // This case could occur when the last sentence of a given passage does not end with a
        // period or other sentence delimiter.
        if (!current_sentence.isEmpty()) {
            sentence_list.add(current_sentence);
        }

        return sentence_list;
    } catch (IOException e) {
        // NOTE(review): callers must handle the null return on I/O failure.
        AppLogger.error.log(Level.SEVERE,
                "Error reading data from reader. Analyzing text for typed dependencies could not be completed");
        return null;
    }
}

From source file:indexing.ReviewTextAnalyzer.java

License:Open Source License

/**
 * @param args/*ww w  . j  av a  2 s  .c o  m*/
 */
public static void main(String[] args) {
    ReviewTextAnalyzer r = new ReviewTextAnalyzer(new ReviewDocumentIndexer());
    String[] filenames = { "review.txt" };
    for (String filename : filenames) {
        try {
            TokenStream tokstr = r.reusableTokenStream(null, new FileReader(filename));

            TermAttribute output_term = tokstr.addAttribute(TermAttribute.class);
            TypeAttribute output_type = tokstr.addAttribute(TypeAttribute.class);
            FlagsAttribute output_flags = tokstr.addAttribute(FlagsAttribute.class);
            PayloadAttribute output_payload = tokstr.addAttribute(PayloadAttribute.class);

            int review_id = r.indexer.theReviewId.get() + 1;
            r.indexer.theReviewId.set(review_id);
            r.indexer.theStats.setCurrent(review_id, 10);

            while (tokstr.incrementToken()) {

                Token current_token = new Token(output_term.term(), output_type.type(), output_flags.getFlags(),
                        new ReviewTermPayload(output_payload.getPayload()));

                System.out.print(current_token);

                if (current_token.isDelim(false)) {
                    System.out.println();
                }
                if (current_token.isDelim(true)) {
                    System.out.println("..................................................................\n");
                }
            }

            System.out.println();

        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }

        System.out.println(
                "\n\n\n\n\n\n\n\n==================================================================\n\n\n\n\n\n\n\n");
    }

    return;
}

From source file:org.apache.solr.handler.component.SpellCheckComponent.java

License:Apache License

/**
 * Tokenizes the query text {@code q} with the given analyzer and
 * materializes every emitted token as a Lucene {@link Token}, copying the
 * term buffer, offsets, type, flags, payload and position increment.
 *
 * @param q        raw query text to analyze
 * @param analyzer analyzer to run; must not be null
 * @return tokens in stream order
 * @throws IOException if the token stream fails while reading
 */
private Collection<Token> getTokens(String q, Analyzer analyzer) throws IOException {
    Collection<Token> tokens = new ArrayList<Token>();
    assert analyzer != null;
    TokenStream stream = analyzer.tokenStream("", q);
    try {
        stream.reset();
        // TODO: support custom attributes
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
        FlagsAttribute flagsAtt = stream.addAttribute(FlagsAttribute.class);
        PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
        PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);

        while (stream.incrementToken()) {
            // Copy the stream's per-token attribute state into a standalone Token.
            Token token = new Token();
            token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
            token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            token.setPositionIncrement(posIncAtt.getPositionIncrement());
            token.setType(typeAtt.type());
            token.setFlags(flagsAtt.getFlags());
            token.setPayload(payloadAtt.getPayload());
            tokens.add(token);
        }
        stream.end();
        return tokens;
    } finally {
        // Release the stream even when iteration fails; close errors are swallowed.
        IOUtils.closeWhileHandlingException(stream);
    }
}

From source file:org.apache.solr.spelling.SimpleQueryConverter.java

License:Apache License

/**
 * Splits the original query on whitespace and converts each piece into a
 * Lucene {@link Token} carrying term text, offsets, flags, payload,
 * position increment and type.
 *
 * @param origQuery the raw query string
 * @return the set of tokens produced by the whitespace analyzer
 */
@Override
public Collection<Token> convert(String origQuery) {
    Collection<Token> converted = new HashSet<Token>();
    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_40);

    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream("", origQuery);
        // TODO: support custom attributes
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
        FlagsAttribute flagsAtt = stream.addAttribute(FlagsAttribute.class);
        PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
        PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);

        stream.reset();

        while (stream.incrementToken()) {
            // Copy the stream's per-token attribute state into a standalone Token.
            Token token = new Token();
            token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
            token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            token.setFlags(flagsAtt.getFlags());
            token.setPayload(payloadAtt.getPayload());
            token.setPositionIncrement(posIncAtt.getPositionIncrement());
            token.setType(typeAtt.type());
            converted.add(token);
        }
        stream.end();
        return converted;
    } catch (IOException e) {
        // Tokenizing an in-memory string should never fail; surface as unchecked.
        throw new RuntimeException(e);
    } finally {
        IOUtils.closeWhileHandlingException(stream);
    }
}

From source file:org.apache.solr.update.processor.OpenNLPExtractNamedEntitiesUpdateProcessorFactory.java

License:Apache License

/**
 * Builds the update processor that extracts named entities from the
 * selected source fields of each added document and appends them to the
 * resolved destination field(s).
 *
 * @param req  the current request; its schema supplies the index analyzer
 * @param rsp  the current response (unused here)
 * @param next the next processor in the chain
 * @return a processor whose {@code processAdd} mutates the input document
 */
@Override
public final UpdateRequestProcessor getInstance(SolrQueryRequest req, SolrQueryResponse rsp,
        UpdateRequestProcessor next) {
    final FieldNameSelector srcSelector = getSourceSelector();
    return new UpdateRequestProcessor(next) {
        private final NLPNERTaggerOp nerTaggerOp;
        private Analyzer analyzer = null;
        // Instance initializer: resolve the NER model and the index analyzer
        // once per processor instance; fail fast if the configured field type
        // is missing from the schema.
        {
            try {
                nerTaggerOp = OpenNLPOpsFactory.getNERTagger(modelFile);
                FieldType fieldType = req.getSchema().getFieldTypeByName(analyzerFieldType);
                if (fieldType == null) {
                    throw new SolrException(SERVER_ERROR, ANALYZER_FIELD_TYPE_PARAM + " '" + analyzerFieldType
                            + "' not found in the schema.");
                }
                analyzer = fieldType.getIndexAnalyzer();
            } catch (IOException e) {
                throw new IllegalArgumentException(e);
            }
        }

        @Override
        public void processAdd(AddUpdateCommand cmd) throws IOException {

            final SolrInputDocument doc = cmd.getSolrInputDocument();

            // Destination may be regex replace string, or "{EntityType}" replaced by
            // each entity's type, both of which can cause multiple output fields.
            Map<String, SolrInputField> destMap = new HashMap<>();

            // preserve initial values
            for (final String fname : doc.getFieldNames()) {
                if (!srcSelector.shouldMutate(fname))
                    continue;

                Collection<Object> srcFieldValues = doc.getFieldValues(fname);
                if (srcFieldValues == null || srcFieldValues.isEmpty())
                    continue;

                String resolvedDest = dest;

                // When a source-field regex is configured, the destination is
                // derived from the matched source field name.
                if (pattern != null) {
                    Matcher matcher = pattern.matcher(fname);
                    if (matcher.find()) {
                        resolvedDest = matcher.replaceAll(dest);
                    } else {
                        log.debug("srcSelector.shouldMutate(\"{}\") returned true, "
                                + "but replacement pattern did not match, field skipped.", fname);
                        continue;
                    }
                }

                for (Object val : srcFieldValues) {
                    for (Pair<String, String> entity : extractTypedNamedEntities(val)) {
                        SolrInputField destField = null;
                        String entityName = entity.first();
                        String entityType = entity.second();
                        // NOTE(review): resolvedDest is reassigned here, so once
                        // ENTITY_TYPE has been substituted for the first entity the
                        // placeholder is gone and every later entity of this field
                        // (even with a different type) lands in the first entity's
                        // destination — confirm this is the intended behavior.
                        resolvedDest = resolvedDest.replace(ENTITY_TYPE, entityType);
                        if (doc.containsKey(resolvedDest)) {
                            destField = doc.getField(resolvedDest);
                        } else {
                            SolrInputField targetField = destMap.get(resolvedDest);
                            if (targetField == null) {
                                destField = new SolrInputField(resolvedDest);
                            } else {
                                destField = targetField;
                            }
                        }
                        destField.addValue(entityName);

                        // put it in map to avoid concurrent modification...
                        destMap.put(resolvedDest, destField);
                    }
                }
            }

            // Apply all accumulated destination fields after iteration is done.
            for (Map.Entry<String, SolrInputField> entry : destMap.entrySet()) {
                doc.put(entry.getKey(), entry.getValue());
            }
            super.processAdd(cmd);
        }

        /** Using configured NER model, extracts (name, type) pairs from the given source field value */
        private List<Pair<String, String>> extractTypedNamedEntities(Object srcFieldValue) throws IOException {
            List<Pair<String, String>> entitiesWithType = new ArrayList<>();
            List<String> terms = new ArrayList<>();
            List<Integer> startOffsets = new ArrayList<>();
            List<Integer> endOffsets = new ArrayList<>();
            String fullText = srcFieldValue.toString();
            TokenStream tokenStream = analyzer.tokenStream("", fullText);
            CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
            FlagsAttribute flagsAtt = tokenStream.addAttribute(FlagsAttribute.class);
            tokenStream.reset();
            // The tagger keeps adaptive state across calls, so serialize access to it.
            synchronized (nerTaggerOp) {
                while (tokenStream.incrementToken()) {
                    terms.add(termAtt.toString());
                    startOffsets.add(offsetAtt.startOffset());
                    endOffsets.add(offsetAtt.endOffset());
                    // The tokenizer marks sentence boundaries via the EOS flag bit.
                    boolean endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
                    if (endOfSentence) { // extract named entities one sentence at a time
                        extractEntitiesFromSentence(fullText, terms, startOffsets, endOffsets,
                                entitiesWithType);
                    }
                }
                tokenStream.end();
                tokenStream.close();
                if (!terms.isEmpty()) { // In case last token of last sentence isn't properly flagged with EOS_FLAG_BIT
                    extractEntitiesFromSentence(fullText, terms, startOffsets, endOffsets, entitiesWithType);
                }
                nerTaggerOp.reset(); // Forget all adaptive data collected during previous calls
            }
            return entitiesWithType;
        }

        /**
         * Runs the NER tagger over one sentence's terms, mapping each found
         * span back to its substring of the full text, then clears the
         * per-sentence buffers (terms/offsets) for the next sentence.
         */
        private void extractEntitiesFromSentence(String fullText, List<String> terms,
                List<Integer> startOffsets, List<Integer> endOffsets,
                List<Pair<String, String>> entitiesWithType) {
            for (Span span : nerTaggerOp.getNames(terms.toArray(new String[terms.size()]))) {
                String text = fullText.substring(startOffsets.get(span.getStart()),
                        endOffsets.get(span.getEnd() - 1));
                entitiesWithType.add(new Pair<>(text, span.getType()));
            }
            terms.clear();
            startOffsets.clear();
            endOffsets.clear();
        }
    };
}