Usage examples for `org.apache.lucene.analysis.tokenattributes.FlagsAttribute#getFlags()`, collected from open-source projects.
public int getFlags();
From source file:com.jaeksoft.searchlib.analysis.TokenTerm.java
License:Open Source License
public TokenTerm(final CharTermAttribute termAtt, final PositionIncrementAttribute posIncrAtt, final OffsetAttribute offsetAtt, final TypeAttribute typeAtt, final FlagsAttribute flagsAtt) { this.term = termAtt != null ? termAtt.toString() : null; this.start = offsetAtt != null ? offsetAtt.startOffset() : 0; this.end = offsetAtt != null ? offsetAtt.endOffset() : 0; this.increment = posIncrAtt != null ? posIncrAtt.getPositionIncrement() : 0; this.type = typeAtt != null ? typeAtt.type() : null; this.flags = flagsAtt != null ? flagsAtt.getFlags() : 0; }
From source file:com.qwazr.search.index.TermDefinition.java
License:Apache License
/**
 * Records one emitted token's attribute values.
 * Every field is null when the corresponding attribute reference is null.
 */
TermDefinition(CharTermAttribute charTermAttr, FlagsAttribute flagsAttr, OffsetAttribute offsetAttr,
        PositionIncrementAttribute posIncAttr, PositionLengthAttribute posLengthAttr,
        TypeAttribute typeAttr, KeywordAttribute keywordAttr) {
    char_term = charTermAttr != null ? charTermAttr.toString() : null;
    if (offsetAttr == null) {
        start_offset = null;
        end_offset = null;
    } else {
        start_offset = offsetAttr.startOffset();
        end_offset = offsetAttr.endOffset();
    }
    flags = flagsAttr != null ? flagsAttr.getFlags() : null;
    position_increment = posIncAttr != null ? posIncAttr.getPositionIncrement() : null;
    position_length = posLengthAttr != null ? posLengthAttr.getPositionLength() : null;
    type = typeAttr != null ? typeAttr.type() : null;
    is_keyword = keywordAttr != null ? keywordAttr.isKeyword() : null;
}
From source file:dependencies.ReviewDependencyAnalyzer.java
License:Open Source License
public ArrayList<ArrayList<Token>> getSentences(Reader reader) { try {/* w ww . j av a 2 s. c o m*/ // Send reader data through the analyzer TokenStream tokstr = reusableTokenStream("", reader); TermAttribute tok_term = tokstr.getAttribute(TermAttribute.class); TypeAttribute tok_type = tokstr.getAttribute(TypeAttribute.class); FlagsAttribute tok_flags = tokstr.getAttribute(FlagsAttribute.class); PayloadAttribute tok_payload = tokstr.getAttribute(PayloadAttribute.class); // Split the tokenstream returned by the analyzer into sentences. Convert each sentence // into a linked list of tokens ArrayList<ArrayList<Token>> sentence_list = new ArrayList<ArrayList<Token>>(); ArrayList<Token> current_sentence = new ArrayList<Token>(); while (tokstr.incrementToken()) { Token current_token = new Token(tok_term.term(), tok_type.type(), tok_flags.getFlags(), new ReviewTermPayload(tok_payload.getPayload())); current_sentence.add(current_token); // End of sentence reached. Add current sentence to the sentence list if (current_token.isDelim(true)) { if (current_sentence.size() > 1) { sentence_list.add(current_sentence); } current_sentence = new ArrayList<Token>(); } } // At the end of the token stream, if there is an incomplete sentence, add it to the // sentence list. // This case could occur when the last sentence of a given passage does not end with a // period or other sentence delimiter. if (!current_sentence.isEmpty()) { sentence_list.add(current_sentence); } return sentence_list; } catch (IOException e) { AppLogger.error.log(Level.SEVERE, "Error reading data from reader. Analyzing text for typed dependencies could not be completed"); return null; } }
From source file:indexing.ReviewTextAnalyzer.java
License:Open Source License
/** * @param args/*ww w . j av a 2 s .c o m*/ */ public static void main(String[] args) { ReviewTextAnalyzer r = new ReviewTextAnalyzer(new ReviewDocumentIndexer()); String[] filenames = { "review.txt" }; for (String filename : filenames) { try { TokenStream tokstr = r.reusableTokenStream(null, new FileReader(filename)); TermAttribute output_term = tokstr.addAttribute(TermAttribute.class); TypeAttribute output_type = tokstr.addAttribute(TypeAttribute.class); FlagsAttribute output_flags = tokstr.addAttribute(FlagsAttribute.class); PayloadAttribute output_payload = tokstr.addAttribute(PayloadAttribute.class); int review_id = r.indexer.theReviewId.get() + 1; r.indexer.theReviewId.set(review_id); r.indexer.theStats.setCurrent(review_id, 10); while (tokstr.incrementToken()) { Token current_token = new Token(output_term.term(), output_type.type(), output_flags.getFlags(), new ReviewTermPayload(output_payload.getPayload())); System.out.print(current_token); if (current_token.isDelim(false)) { System.out.println(); } if (current_token.isDelim(true)) { System.out.println("..................................................................\n"); } } System.out.println(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } System.out.println( "\n\n\n\n\n\n\n\n==================================================================\n\n\n\n\n\n\n\n"); } return; }
From source file:org.apache.solr.handler.component.SpellCheckComponent.java
License:Apache License
private Collection<Token> getTokens(String q, Analyzer analyzer) throws IOException { Collection<Token> result = new ArrayList<Token>(); assert analyzer != null; TokenStream ts = analyzer.tokenStream("", q); try {//from ww w . j a v a 2 s.c om ts.reset(); // TODO: support custom attributes CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class); TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class); FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class); PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class); PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class); while (ts.incrementToken()) { Token token = new Token(); token.copyBuffer(termAtt.buffer(), 0, termAtt.length()); token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset()); token.setType(typeAtt.type()); token.setFlags(flagsAtt.getFlags()); token.setPayload(payloadAtt.getPayload()); token.setPositionIncrement(posIncAtt.getPositionIncrement()); result.add(token); } ts.end(); return result; } finally { IOUtils.closeWhileHandlingException(ts); } }
From source file:org.apache.solr.spelling.SimpleQueryConverter.java
License:Apache License
@Override public Collection<Token> convert(String origQuery) { Collection<Token> result = new HashSet<Token>(); WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_40); TokenStream ts = null;/*from w ww.j ava 2s. c o m*/ try { ts = analyzer.tokenStream("", origQuery); // TODO: support custom attributes CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class); TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class); FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class); PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class); PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class); ts.reset(); while (ts.incrementToken()) { Token tok = new Token(); tok.copyBuffer(termAtt.buffer(), 0, termAtt.length()); tok.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset()); tok.setFlags(flagsAtt.getFlags()); tok.setPayload(payloadAtt.getPayload()); tok.setPositionIncrement(posIncAtt.getPositionIncrement()); tok.setType(typeAtt.type()); result.add(tok); } ts.end(); return result; } catch (IOException e) { throw new RuntimeException(e); } finally { IOUtils.closeWhileHandlingException(ts); } }
From source file:org.apache.solr.update.processor.OpenNLPExtractNamedEntitiesUpdateProcessorFactory.java
License:Apache License
/**
 * Builds an update processor that runs the configured OpenNLP NER model over
 * selected source fields and copies each extracted entity name into a
 * destination field. The destination name may come from a regex replacement
 * of the source field name and/or contain an "{EntityType}" placeholder.
 */
@Override
public final UpdateRequestProcessor getInstance(SolrQueryRequest req, SolrQueryResponse rsp,
        UpdateRequestProcessor next) {
    final FieldNameSelector srcSelector = getSourceSelector();
    return new UpdateRequestProcessor(next) {
        private final NLPNERTaggerOp nerTaggerOp;
        private Analyzer analyzer = null;
        // Instance initializer: loads the NER model and resolves the index
        // analyzer from the configured field type; fails fast if either is
        // missing or unreadable.
        {
            try {
                nerTaggerOp = OpenNLPOpsFactory.getNERTagger(modelFile);
                FieldType fieldType = req.getSchema().getFieldTypeByName(analyzerFieldType);
                if (fieldType == null) {
                    throw new SolrException(SERVER_ERROR, ANALYZER_FIELD_TYPE_PARAM + " '" + analyzerFieldType
                            + "' not found in the schema.");
                }
                analyzer = fieldType.getIndexAnalyzer();
            } catch (IOException e) {
                throw new IllegalArgumentException(e);
            }
        }

        @Override
        public void processAdd(AddUpdateCommand cmd) throws IOException {
            final SolrInputDocument doc = cmd.getSolrInputDocument();

            // Destination may be regex replace string, or "{EntityType}" replaced by
            // each entity's type, both of which can cause multiple output fields.
            Map<String, SolrInputField> destMap = new HashMap<>(); // preserve initial values

            for (final String fname : doc.getFieldNames()) {
                // Only fields matched by the configured source selector are processed.
                if (!srcSelector.shouldMutate(fname))
                    continue;

                Collection<Object> srcFieldValues = doc.getFieldValues(fname);
                if (srcFieldValues == null || srcFieldValues.isEmpty())
                    continue;

                String resolvedDest = dest;

                if (pattern != null) {
                    Matcher matcher = pattern.matcher(fname);
                    if (matcher.find()) {
                        resolvedDest = matcher.replaceAll(dest);
                    } else {
                        log.debug("srcSelector.shouldMutate(\"{}\") returned true, "
                                + "but replacement pattern did not match, field skipped.", fname);
                        continue;
                    }
                }

                for (Object val : srcFieldValues) {
                    for (Pair<String, String> entity : extractTypedNamedEntities(val)) {
                        SolrInputField destField = null;
                        String entityName = entity.first();
                        String entityType = entity.second();
                        // NOTE(review): resolvedDest is reassigned here, so after the
                        // first substitution the "{EntityType}" placeholder is gone and
                        // later entities with a different type reuse the first type's
                        // field name — confirm this is the intended behavior.
                        resolvedDest = resolvedDest.replace(ENTITY_TYPE, entityType);
                        if (doc.containsKey(resolvedDest)) {
                            destField = doc.getField(resolvedDest);
                        } else {
                            SolrInputField targetField = destMap.get(resolvedDest);
                            if (targetField == null) {
                                destField = new SolrInputField(resolvedDest);
                            } else {
                                destField = targetField;
                            }
                        }
                        destField.addValue(entityName);

                        // put it in map to avoid concurrent modification...
                        destMap.put(resolvedDest, destField);
                    }
                }
            }

            // Install all accumulated destination fields on the document only
            // after iteration over its field names has finished.
            for (Map.Entry<String, SolrInputField> entry : destMap.entrySet()) {
                doc.put(entry.getKey(), entry.getValue());
            }
            super.processAdd(cmd);
        }

        /** Using configured NER model, extracts (name, type) pairs from the given source field value */
        private List<Pair<String, String>> extractTypedNamedEntities(Object srcFieldValue) throws IOException {
            List<Pair<String, String>> entitiesWithType = new ArrayList<>();
            List<String> terms = new ArrayList<>();
            List<Integer> startOffsets = new ArrayList<>();
            List<Integer> endOffsets = new ArrayList<>();
            String fullText = srcFieldValue.toString();
            TokenStream tokenStream = analyzer.tokenStream("", fullText);
            CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
            FlagsAttribute flagsAtt = tokenStream.addAttribute(FlagsAttribute.class);
            tokenStream.reset();
            // The tagger holds adaptive state, so all interaction with it is
            // serialized on the shared instance.
            synchronized (nerTaggerOp) {
                while (tokenStream.incrementToken()) {
                    terms.add(termAtt.toString());
                    startOffsets.add(offsetAtt.startOffset());
                    endOffsets.add(offsetAtt.endOffset());
                    // EOS_FLAG_BIT marks the last token of a sentence.
                    boolean endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
                    if (endOfSentence) { // extract named entities one sentence at a time
                        extractEntitiesFromSentence(fullText, terms, startOffsets, endOffsets, entitiesWithType);
                    }
                }
                tokenStream.end();
                tokenStream.close();
                if (!terms.isEmpty()) {
                    // In case last token of last sentence isn't properly flagged with EOS_FLAG_BIT
                    extractEntitiesFromSentence(fullText, terms, startOffsets, endOffsets, entitiesWithType);
                }
                nerTaggerOp.reset(); // Forget all adaptive data collected during previous calls
            }
            return entitiesWithType;
        }

        /**
         * Runs the NER tagger over the buffered sentence tokens, recording each
         * recognized span's original text and type, then clears the buffers for
         * the next sentence.
         */
        private void extractEntitiesFromSentence(String fullText, List<String> terms, List<Integer> startOffsets,
                List<Integer> endOffsets, List<Pair<String, String>> entitiesWithType) {
            for (Span span : nerTaggerOp.getNames(terms.toArray(new String[terms.size()]))) {
                // Span end is exclusive, so the entity's last character offset
                // comes from the token at (end - 1).
                String text = fullText.substring(startOffsets.get(span.getStart()),
                        endOffsets.get(span.getEnd() - 1));
                entitiesWithType.add(new Pair<>(text, span.getType()));
            }
            terms.clear();
            startOffsets.clear();
            endOffsets.clear();
        }
    };
}