List of usage examples for org.apache.lucene.index Terms hasOffsets
/**
 * Signature of the {@code hasOffsets} method on {@code org.apache.lucene.index.Terms},
 * the method every example below calls. It is abstract and yields a boolean;
 * presumably {@code true} when the field's postings carry offsets (confirm against the Lucene Javadoc).
 */
public abstract boolean hasOffsets();
From source file:com.twentyn.patentSearch.DocumentSearch.java
License:Open Source License
public static void main(String[] args) throws Exception { System.out.println("Starting up..."); System.out.flush();// w ww. j av a 2s . co m Options opts = new Options(); opts.addOption(Option.builder("x").longOpt("index").hasArg().required().desc("Path to index file to read") .build()); opts.addOption(Option.builder("h").longOpt("help").desc("Print this help message and exit").build()); opts.addOption(Option.builder("v").longOpt("verbose").desc("Print verbose log output").build()); opts.addOption(Option.builder("f").longOpt("field").hasArg().desc("The indexed field to search").build()); opts.addOption( Option.builder("q").longOpt("query").hasArg().desc("The query to use when searching").build()); opts.addOption(Option.builder("l").longOpt("list-file").hasArg() .desc("A file containing a list of queries to run in sequence").build()); opts.addOption( Option.builder("e").longOpt("enumerate").desc("Enumerate the documents in the index").build()); opts.addOption(Option.builder("d").longOpt("dump").hasArg() .desc("Dump terms in the document index for a specified field").build()); opts.addOption( Option.builder("o").longOpt("output").hasArg().desc("Write results JSON to this file.").build()); opts.addOption(Option.builder("n").longOpt("inchi-field").hasArg() .desc("The index of the InChI field if an input TSV is specified.").build()); opts.addOption(Option.builder("s").longOpt("synonym-field").hasArg() .desc("The index of the chemical synonym field if an input TSV is specified.").build()); HelpFormatter helpFormatter = new HelpFormatter(); CommandLineParser cmdLineParser = new DefaultParser(); CommandLine cmdLine = null; try { cmdLine = cmdLineParser.parse(opts, args); } catch (ParseException e) { System.out.println("Caught exception when parsing command line: " + e.getMessage()); helpFormatter.printHelp("DocumentIndexer", opts); System.exit(1); } if (cmdLine.hasOption("help")) { helpFormatter.printHelp("DocumentIndexer", opts); System.exit(0); } if 
(!(cmdLine.hasOption("enumerate") || cmdLine.hasOption("dump") || (cmdLine.hasOption("field") && (cmdLine.hasOption("query") || cmdLine.hasOption("list-file"))))) { System.out.println("Must specify one of 'enumerate', 'dump', or 'field' + {'query', 'list-file'}"); helpFormatter.printHelp("DocumentIndexer", opts); System.exit(1); } if (cmdLine.hasOption("verbose")) { // With help from http://stackoverflow.com/questions/23434252/programmatically-change-log-level-in-log4j2 LoggerContext ctx = (LoggerContext) LogManager.getContext(false); Configuration ctxConfig = ctx.getConfiguration(); LoggerConfig logConfig = ctxConfig.getLoggerConfig(LogManager.ROOT_LOGGER_NAME); logConfig.setLevel(Level.DEBUG); ctx.updateLoggers(); LOGGER.debug("Verbose logging enabled"); } ObjectMapper objectMapper = new ObjectMapper(); objectMapper.enable(SerializationFeature.INDENT_OUTPUT); objectMapper.setVisibility(PropertyAccessor.ALL, JsonAutoDetect.Visibility.ANY); LOGGER.info("Opening index at " + cmdLine.getOptionValue("index")); try (Directory indexDir = FSDirectory.open(new File(cmdLine.getOptionValue("index")).toPath()); IndexReader indexReader = DirectoryReader.open(indexDir);) { if (cmdLine.hasOption("enumerate")) { /* Enumerate all documents in the index. * With help from * http://stackoverflow.com/questions/2311845/is-it-possible-to-iterate-through-documents-stored-in-lucene-index */ for (int i = 0; i < indexReader.maxDoc(); i++) { Document doc = indexReader.document(i); LOGGER.info("Doc " + i + ":"); LOGGER.info(doc); } } else if (cmdLine.hasOption("dump")) { /* Dump indexed terms for a specific field. 
* With help from http://stackoverflow.com/questions/11148036/find-list-of-terms-indexed-by-lucene */ Terms terms = SlowCompositeReaderWrapper.wrap(indexReader).terms(cmdLine.getOptionValue("dump")); LOGGER.info("Has positions: " + terms.hasPositions()); LOGGER.info("Has offsets: " + terms.hasOffsets()); LOGGER.info("Has freqs: " + terms.hasFreqs()); LOGGER.info("Stats: " + terms.getStats()); LOGGER.info(terms); TermsEnum termsEnum = terms.iterator(); BytesRef br = null; while ((br = termsEnum.next()) != null) { LOGGER.info(" " + br.utf8ToString()); } } else { IndexSearcher searcher = new IndexSearcher(indexReader); String field = cmdLine.getOptionValue("field"); List<Pair<String, String>> queries = null; if (cmdLine.hasOption("query")) { queries = Collections.singletonList(Pair.of("", cmdLine.getOptionValue("query"))); } else if (cmdLine.hasOption("list-file")) { if (!(cmdLine.hasOption("inchi-field") && cmdLine.hasOption("synonym-field"))) { LOGGER.error("Must specify both inchi-field and synonym-field when using list-file."); System.exit(1); } Integer inchiField = Integer.parseInt(cmdLine.getOptionValue("inchi-field")); Integer synonymField = Integer.parseInt(cmdLine.getOptionValue("synonym-field")); queries = new LinkedList<>(); BufferedReader r = new BufferedReader(new FileReader(cmdLine.getOptionValue("list-file"))); String line; while ((line = r.readLine()) != null) { line = line.trim(); if (!line.isEmpty()) { // TODO: use a proper TSV reader; this is intentionally terrible as is. 
String[] fields = line.split("\t"); queries.add(Pair.of(fields[inchiField].replace("\"", ""), fields[synonymField])); } } r.close(); } if (queries == null || queries.size() == 0) { LOGGER.error("Found no queries to run."); return; } List<SearchResult> searchResults = new ArrayList<>(queries.size()); for (Pair<String, String> queryPair : queries) { String inchi = queryPair.getLeft(); String rawQueryString = queryPair.getRight(); /* The Lucene query parser interprets the kind of structural annotations we see in chemical entities * as query directives, which is not what we want at all. Phrase queries seem to work adequately * with the analyzer we're currently using. */ String queryString = rawQueryString.trim().toLowerCase(); String[] parts = queryString.split("\\s+"); PhraseQuery query = new PhraseQuery(); for (String p : parts) { query.add(new Term(field, p)); } LOGGER.info("Running query: " + query.toString()); BooleanQuery bq = new BooleanQuery(); bq.add(query, BooleanClause.Occur.MUST); bq.add(new TermQuery(new Term(field, "yeast")), BooleanClause.Occur.SHOULD); bq.add(new TermQuery(new Term(field, "ferment")), BooleanClause.Occur.SHOULD); bq.add(new TermQuery(new Term(field, "fermentation")), BooleanClause.Occur.SHOULD); bq.add(new TermQuery(new Term(field, "fermentive")), BooleanClause.Occur.SHOULD); bq.add(new TermQuery(new Term(field, "saccharomyces")), BooleanClause.Occur.SHOULD); LOGGER.info(" Full query: " + bq.toString()); TopDocs topDocs = searcher.search(bq, 100); ScoreDoc[] scoreDocs = topDocs.scoreDocs; if (scoreDocs.length == 0) { LOGGER.info("Search returned no results."); } List<ResultDocument> results = new ArrayList<>(scoreDocs.length); for (int i = 0; i < scoreDocs.length; i++) { ScoreDoc scoreDoc = scoreDocs[i]; Document doc = indexReader.document(scoreDoc.doc); LOGGER.info("Doc " + i + ": " + scoreDoc.doc + ", score " + scoreDoc.score + ": " + doc.get("id") + ", " + doc.get("title")); results.add(new ResultDocument(scoreDoc.doc, 
scoreDoc.score, doc.get("title"), doc.get("id"), null)); } LOGGER.info("----- Done with query " + query.toString()); // TODO: reduce memory usage when not writing results to an output file. searchResults.add(new SearchResult(inchi, rawQueryString, bq, results)); } if (cmdLine.hasOption("output")) { try (FileWriter writer = new FileWriter(cmdLine.getOptionValue("output"));) { writer.write(objectMapper.writeValueAsString(searchResults)); } } } } }
From source file:org.apache.blur.lucene.warmup.IndexWarmup.java
License:Apache License
/**
 * Produces warm-up trace samples for one segment, keyed by segment name.
 *
 * <p>If a sample file for the segment already exists in its directory, the traces are read
 * from it; otherwise every field is traced and the traces are written to a new sample file.
 *
 * @param atomicReader reader to sample; anything other than a {@link SegmentReader} yields an empty map
 * @param context label used only in log messages
 * @return map from segment name to its traces; an empty map when the reader or its directory
 *         cannot be sampled; {@code null} when this warm-up was closed while sampling
 * @throws IOException if reading or writing the sample file fails
 */
public Map<String, List<IndexTracerResult>> sampleIndex(AtomicReader atomicReader, String context)
        throws IOException {
    Map<String, List<IndexTracerResult>> results = new HashMap<String, List<IndexTracerResult>>();
    if (!(atomicReader instanceof SegmentReader)) {
        return results;
    }
    SegmentReader segmentReader = (SegmentReader) atomicReader;
    Directory directory = segmentReader.directory();
    if (!(directory instanceof TraceableDirectory)) {
        LOG.info("Context [{1}] cannot warmup directory [{0}] needs to be a TraceableDirectory.", directory,
                context);
        return results;
    }
    IndexTracer tracer = new IndexTracer((TraceableDirectory) directory, _maxSampleSize);
    String fileName = getSampleFileName(segmentReader.getSegmentName());
    List<IndexTracerResult> segmentTraces = new ArrayList<IndexTracerResult>();
    if (directory.fileExists(fileName)) {
        IndexInput input = directory.openInput(fileName, IOContext.READONCE);
        try {
            segmentTraces = read(input);
        } finally {
            // Release the input even when the sample file cannot be parsed.
            input.close();
        }
    } else {
        Fields fields = atomicReader.fields();
        for (String field : fields) {
            LOG.debug("Context [{1}] sampling field [{0}].", field, context);
            Terms terms = fields.terms(field);
            boolean hasOffsets = terms.hasOffsets();
            boolean hasPayloads = terms.hasPayloads();
            boolean hasPositions = terms.hasPositions();

            tracer.initTrace(segmentReader, field, hasPositions, hasPayloads, hasOffsets);
            IndexTracerResult result = tracer.runTrace(terms);
            segmentTraces.add(result);
        }
        // Do not persist samples when the warm-up was closed during tracing.
        if (_isClosed.get()) {
            LOG.info("Context [{0}] index closed", context);
            return null;
        }
        IndexOutput output = directory.createOutput(fileName, IOContext.DEFAULT);
        try {
            write(segmentTraces, output);
        } finally {
            // Release the output even when serialising the traces fails.
            output.close();
        }
    }
    results.put(segmentReader.getSegmentName(), segmentTraces);
    return results;
}
From source file:org.elasticsearch.action.termvector.TermVectorResponse.java
License:Apache License
/**
 * Writes the per-occurrence token array (position, offsets, payload) for the current term.
 *
 * <p>Nothing is written when the field stores none of positions, offsets, or payloads.
 * A payload entry is emitted only when a non-empty payload was recorded for that occurrence.
 *
 * @param builder  output builder to append to
 * @param curTerms terms of the current field; decides which attributes are present
 * @param termFreq number of occurrences to write
 * @throws IOException if the builder fails to write
 */
private void buildValues(XContentBuilder builder, Terms curTerms, int termFreq) throws IOException {
    // These are constant for the field, so query them once instead of once per occurrence.
    final boolean hasPayloads = curTerms.hasPayloads();
    final boolean hasOffsets = curTerms.hasOffsets();
    final boolean hasPositions = curTerms.hasPositions();
    if (!(hasPayloads || hasOffsets || hasPositions)) {
        return;
    }

    builder.startArray(FieldStrings.TOKENS);
    for (int i = 0; i < termFreq; i++) {
        builder.startObject();
        if (hasPositions) {
            builder.field(FieldStrings.POS, curentPositions[i]);
        }
        if (hasOffsets) {
            builder.field(FieldStrings.START_OFFSET, currentStartOffset[i]);
            builder.field(FieldStrings.END_OFFSET, currentEndOffset[i]);
        }
        // initValues stores null for an occurrence without a payload, so guard before dereferencing.
        if (hasPayloads && currentPayloads[i] != null && currentPayloads[i].length() > 0) {
            builder.field(FieldStrings.PAYLOAD, currentPayloads[i]);
        }
        builder.endObject();
    }
    builder.endArray();
}
From source file:org.elasticsearch.action.termvector.TermVectorResponse.java
License:Apache License
/**
 * Reads {@code termFreq} occurrences of the current term from {@code posEnum} into the
 * position, offset, and payload scratch arrays.
 *
 * @param curTerms terms of the current field; decides which attributes are recorded
 * @param posEnum  enum positioned on the current term and document
 * @param termFreq number of occurrences to read
 * @throws IOException if the enum fails to advance
 */
private void initValues(Terms curTerms, DocsAndPositionsEnum posEnum, int termFreq) throws IOException {
    final boolean hasPositions = curTerms.hasPositions();
    final boolean hasOffsets = curTerms.hasOffsets();
    final boolean hasPayloads = curTerms.hasPayloads();
    for (int j = 0; j < termFreq; j++) {
        // Always advance, even when positions are not recorded, so offsets and payloads line up.
        int nextPos = posEnum.nextPosition();
        if (hasPositions) {
            curentPositions[j] = nextPos;
        }
        if (hasOffsets) {
            currentStartOffset[j] = posEnum.startOffset();
            currentEndOffset[j] = posEnum.endOffset();
        }
        if (hasPayloads) {
            BytesRef curPayload = posEnum.getPayload();
            if (curPayload != null) {
                // A BytesRef's valid bytes start at its offset, not necessarily at index 0.
                currentPayloads[j] = new BytesArray(curPayload.bytes, curPayload.offset, curPayload.length);
            } else {
                currentPayloads[j] = null;
            }
        }
    }
}
From source file:org.elasticsearch.action.termvector.TermVectorResponse.java
License:Apache License
private void initMemory(Terms curTerms, int termFreq) { // init memory for performance reasons if (curTerms.hasPositions()) { curentPositions = ArrayUtil.grow(curentPositions, termFreq); }// ww w . j a v a 2s . c o m if (curTerms.hasOffsets()) { currentStartOffset = ArrayUtil.grow(currentStartOffset, termFreq); currentEndOffset = ArrayUtil.grow(currentEndOffset, termFreq); } if (curTerms.hasPayloads()) { currentPayloads = new BytesArray[termFreq]; } }
From source file:org.elasticsearch.action.termvector.TermVectorWriter.java
License:Apache License
void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields) throws IOException { int numFieldsWritten = 0; TermsEnum iterator = null;//from w w w . j a v a2 s . c o m DocsAndPositionsEnum docsAndPosEnum = null; DocsEnum docsEnum = null; TermsEnum topLevelIterator = null; for (String field : termVectorsByField) { if ((selectedFields != null) && (!selectedFields.contains(field))) { continue; } Terms fieldTermVector = termVectorsByField.terms(field); Terms topLevelTerms = topLevelFields.terms(field); topLevelIterator = topLevelTerms.iterator(topLevelIterator); boolean positions = flags.contains(Flag.Positions) && fieldTermVector.hasPositions(); boolean offsets = flags.contains(Flag.Offsets) && fieldTermVector.hasOffsets(); boolean payloads = flags.contains(Flag.Payloads) && fieldTermVector.hasPayloads(); startField(field, fieldTermVector.size(), positions, offsets, payloads); if (flags.contains(Flag.FieldStatistics)) { writeFieldStatistics(topLevelTerms); } iterator = fieldTermVector.iterator(iterator); final boolean useDocsAndPos = positions || offsets || payloads; while (iterator.next() != null) { // iterate all terms of the // current field // get the doc frequency BytesRef term = iterator.term(); boolean foundTerm = topLevelIterator.seekExact(term); assert (foundTerm); startTerm(term); if (flags.contains(Flag.TermStatistics)) { writeTermStatistics(topLevelIterator); } if (useDocsAndPos) { // given we have pos or offsets docsAndPosEnum = writeTermWithDocsAndPos(iterator, docsAndPosEnum, positions, offsets, payloads); } else { // if we do not have the positions stored, we need to // get the frequency from a DocsEnum. docsEnum = writeTermWithDocsOnly(iterator, docsEnum); } } numFieldsWritten++; } response.setTermVectorField(output); response.setHeader(writeHeader(numFieldsWritten, flags.contains(Flag.TermStatistics), flags.contains(Flag.FieldStatistics))); }
From source file:org.elasticsearch.action.termvectors.TermVectorsResponse.java
License:Apache License
/**
 * Writes the per-occurrence token array (position, offsets, payload) for the current term.
 *
 * <p>Nothing is written when the field stores none of positions, offsets, or payloads.
 * A payload entry is emitted only when a non-empty payload was recorded for that occurrence.
 *
 * @param builder  output builder to append to
 * @param curTerms terms of the current field; decides which attributes are present
 * @param termFreq number of occurrences to write
 * @throws IOException if the builder fails to write
 */
private void buildValues(XContentBuilder builder, Terms curTerms, int termFreq) throws IOException {
    // These are constant for the field, so query them once instead of once per occurrence.
    final boolean hasPayloads = curTerms.hasPayloads();
    final boolean hasOffsets = curTerms.hasOffsets();
    final boolean hasPositions = curTerms.hasPositions();
    if (!(hasPayloads || hasOffsets || hasPositions)) {
        return;
    }

    builder.startArray(FieldStrings.TOKENS);
    for (int i = 0; i < termFreq; i++) {
        builder.startObject();
        if (hasPositions) {
            builder.field(FieldStrings.POS, currentPositions[i]);
        }
        if (hasOffsets) {
            builder.field(FieldStrings.START_OFFSET, currentStartOffset[i]);
            builder.field(FieldStrings.END_OFFSET, currentEndOffset[i]);
        }
        // initValues stores null for an occurrence without a payload, so guard before dereferencing.
        if (hasPayloads && currentPayloads[i] != null && currentPayloads[i].length() > 0) {
            builder.field(FieldStrings.PAYLOAD, currentPayloads[i]);
        }
        builder.endObject();
    }
    builder.endArray();
}
From source file:org.elasticsearch.action.termvectors.TermVectorsResponse.java
License:Apache License
/**
 * Reads {@code termFreq} occurrences of the current term from {@code posEnum} into the
 * position, offset, and payload scratch arrays.
 *
 * @param curTerms terms of the current field; decides which attributes are recorded
 * @param posEnum  enum positioned on the current term and document
 * @param termFreq number of occurrences to read
 * @throws IOException if the enum fails to advance
 */
private void initValues(Terms curTerms, PostingsEnum posEnum, int termFreq) throws IOException {
    final boolean hasPositions = curTerms.hasPositions();
    final boolean hasOffsets = curTerms.hasOffsets();
    final boolean hasPayloads = curTerms.hasPayloads();
    for (int j = 0; j < termFreq; j++) {
        // Always advance, even when positions are not recorded, so offsets and payloads line up.
        int nextPos = posEnum.nextPosition();
        if (hasPositions) {
            currentPositions[j] = nextPos;
        }
        if (hasOffsets) {
            currentStartOffset[j] = posEnum.startOffset();
            currentEndOffset[j] = posEnum.endOffset();
        }
        if (hasPayloads) {
            BytesRef curPayload = posEnum.getPayload();
            if (curPayload != null) {
                // A BytesRef's valid bytes start at its offset, not necessarily at index 0.
                currentPayloads[j] = new BytesArray(curPayload.bytes, curPayload.offset, curPayload.length);
            } else {
                currentPayloads[j] = null;
            }
        }
    }
}
From source file:org.elasticsearch.action.termvectors.TermVectorsResponse.java
License:Apache License
private void initMemory(Terms curTerms, int termFreq) { // init memory for performance reasons if (curTerms.hasPositions()) { currentPositions = ArrayUtil.grow(currentPositions, termFreq); }/* ww w . j a v a2s . co m*/ if (curTerms.hasOffsets()) { currentStartOffset = ArrayUtil.grow(currentStartOffset, termFreq); currentEndOffset = ArrayUtil.grow(currentEndOffset, termFreq); } if (curTerms.hasPayloads()) { currentPayloads = new BytesArray[termFreq]; } }
From source file:org.elasticsearch.action.termvectors.TermVectorsWriter.java
License:Apache License
void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields, @Nullable AggregatedDfs dfs, @Nullable TermVectorsFilter termVectorsFilter) throws IOException { int numFieldsWritten = 0; PostingsEnum docsAndPosEnum = null;//w w w . j a va 2s. co m PostingsEnum docsEnum = null; boolean hasScores = termVectorsFilter != null; for (String field : termVectorsByField) { if ((selectedFields != null) && (!selectedFields.contains(field))) { continue; } Terms fieldTermVector = termVectorsByField.terms(field); Terms topLevelTerms = topLevelFields.terms(field); // if no terms found, take the retrieved term vector fields for stats if (topLevelTerms == null) { topLevelTerms = fieldTermVector; } TermsEnum topLevelIterator = topLevelTerms.iterator(); boolean positions = flags.contains(Flag.Positions) && fieldTermVector.hasPositions(); boolean offsets = flags.contains(Flag.Offsets) && fieldTermVector.hasOffsets(); boolean payloads = flags.contains(Flag.Payloads) && fieldTermVector.hasPayloads(); long termsSize = fieldTermVector.size(); if (hasScores) { termsSize = Math.min(termsSize, termVectorsFilter.size(field)); } startField(field, termsSize, positions, offsets, payloads); if (flags.contains(Flag.FieldStatistics)) { if (dfs != null) { writeFieldStatistics(dfs.fieldStatistics().get(field)); } else { writeFieldStatistics(topLevelTerms); } } TermsEnum iterator = fieldTermVector.iterator(); final boolean useDocsAndPos = positions || offsets || payloads; while (iterator.next() != null) { // iterate all terms of the current field BytesRef termBytesRef = iterator.term(); Term term = new Term(field, termBytesRef); // with filtering we only keep the best terms if (hasScores && !termVectorsFilter.hasScoreTerm(term)) { continue; } startTerm(termBytesRef); if (flags.contains(Flag.TermStatistics)) { // get the doc frequency if (dfs != null) { final TermStatistics statistics = dfs.termStatistics().get(term); writeTermStatistics( statistics == 
null ? new TermStatistics(termBytesRef, 0, 0) : statistics); } else { boolean foundTerm = topLevelIterator.seekExact(termBytesRef); if (foundTerm) { writeTermStatistics(topLevelIterator); } else { writeTermStatistics(new TermStatistics(termBytesRef, 0, 0)); } } } if (useDocsAndPos) { // given we have pos or offsets docsAndPosEnum = writeTermWithDocsAndPos(iterator, docsAndPosEnum, positions, offsets, payloads); } else { // if we do not have the positions stored, we need to // get the frequency from a PostingsEnum. docsEnum = writeTermWithDocsOnly(iterator, docsEnum); } if (hasScores) { writeScoreTerm(termVectorsFilter.getScoreTerm(term)); } } numFieldsWritten++; } response.setTermVectorsField(output); response.setHeader(writeHeader(numFieldsWritten, flags.contains(Flag.TermStatistics), flags.contains(Flag.FieldStatistics), hasScores)); }