Usage examples for org.apache.lucene.index.Terms#hasPositions()
public abstract boolean hasPositions();
From source file: com.rocana.lucene.codec.v1.TestBlockPostingsFormat3.java
License: Apache License
public void assertTerms(Terms leftTerms, Terms rightTerms, boolean deep) throws Exception {
    if (leftTerms == null || rightTerms == null) {
        assertNull(leftTerms);
        assertNull(rightTerms);
        return;
    }
    assertTermsStatistics(leftTerms, rightTerms);

    // NOTE: we don't assert hasOffsets/hasPositions/hasPayloads because they are allowed to be different
    boolean bothHavePositions = leftTerms.hasPositions() && rightTerms.hasPositions();
    TermsEnum leftTermsEnum = leftTerms.iterator();
    TermsEnum rightTermsEnum = rightTerms.iterator();
    assertTermsEnum(leftTermsEnum, rightTermsEnum, true, bothHavePositions);

    assertTermsSeeking(leftTerms, rightTerms);

    if (deep) {
        int numIntersections = atLeast(3);
        for (int i = 0; i < numIntersections; i++) {
            String re = AutomatonTestUtil.randomRegexp(random());
            CompiledAutomaton automaton = new CompiledAutomaton(new RegExp(re, RegExp.NONE).toAutomaton());
            if (automaton.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
                // TODO: test start term too
                TermsEnum leftIntersection = leftTerms.intersect(automaton, null);
                TermsEnum rightIntersection = rightTerms.intersect(automaton, null);
                assertTermsEnum(leftIntersection, rightIntersection, rarely(), bothHavePositions);
            }
        }
    }
}
From source file: com.twentyn.patentSearch.DocumentSearch.java
License: Open Source License
public static void main(String[] args) throws Exception {
    System.out.println("Starting up...");
    System.out.flush();

    Options opts = new Options();
    opts.addOption(Option.builder("x").longOpt("index").hasArg().required().desc("Path to index file to read")
            .build());
    opts.addOption(Option.builder("h").longOpt("help").desc("Print this help message and exit").build());
    opts.addOption(Option.builder("v").longOpt("verbose").desc("Print verbose log output").build());
    opts.addOption(Option.builder("f").longOpt("field").hasArg().desc("The indexed field to search").build());
    opts.addOption(
            Option.builder("q").longOpt("query").hasArg().desc("The query to use when searching").build());
    opts.addOption(Option.builder("l").longOpt("list-file").hasArg()
            .desc("A file containing a list of queries to run in sequence").build());
    opts.addOption(
            Option.builder("e").longOpt("enumerate").desc("Enumerate the documents in the index").build());
    opts.addOption(Option.builder("d").longOpt("dump").hasArg()
            .desc("Dump terms in the document index for a specified field").build());
    opts.addOption(
            Option.builder("o").longOpt("output").hasArg().desc("Write results JSON to this file.").build());
    opts.addOption(Option.builder("n").longOpt("inchi-field").hasArg()
            .desc("The index of the InChI field if an input TSV is specified.").build());
    opts.addOption(Option.builder("s").longOpt("synonym-field").hasArg()
            .desc("The index of the chemical synonym field if an input TSV is specified.").build());

    HelpFormatter helpFormatter = new HelpFormatter();
    CommandLineParser cmdLineParser = new DefaultParser();
    CommandLine cmdLine = null;
    try {
        cmdLine = cmdLineParser.parse(opts, args);
    } catch (ParseException e) {
        System.out.println("Caught exception when parsing command line: " + e.getMessage());
        helpFormatter.printHelp("DocumentIndexer", opts);
        System.exit(1);
    }

    if (cmdLine.hasOption("help")) {
        helpFormatter.printHelp("DocumentIndexer", opts);
        System.exit(0);
    }

    if (!(cmdLine.hasOption("enumerate") || cmdLine.hasOption("dump")
            || (cmdLine.hasOption("field") && (cmdLine.hasOption("query") || cmdLine.hasOption("list-file"))))) {
        System.out.println("Must specify one of 'enumerate', 'dump', or 'field' + {'query', 'list-file'}");
        helpFormatter.printHelp("DocumentIndexer", opts);
        System.exit(1);
    }

    if (cmdLine.hasOption("verbose")) {
        // With help from http://stackoverflow.com/questions/23434252/programmatically-change-log-level-in-log4j2
        LoggerContext ctx = (LoggerContext) LogManager.getContext(false);
        Configuration ctxConfig = ctx.getConfiguration();
        LoggerConfig logConfig = ctxConfig.getLoggerConfig(LogManager.ROOT_LOGGER_NAME);
        logConfig.setLevel(Level.DEBUG);
        ctx.updateLoggers();
        LOGGER.debug("Verbose logging enabled");
    }

    ObjectMapper objectMapper = new ObjectMapper();
    objectMapper.enable(SerializationFeature.INDENT_OUTPUT);
    objectMapper.setVisibility(PropertyAccessor.ALL, JsonAutoDetect.Visibility.ANY);

    LOGGER.info("Opening index at " + cmdLine.getOptionValue("index"));
    try (Directory indexDir = FSDirectory.open(new File(cmdLine.getOptionValue("index")).toPath());
            IndexReader indexReader = DirectoryReader.open(indexDir);) {
        if (cmdLine.hasOption("enumerate")) {
            /* Enumerate all documents in the index.
             * With help from
             * http://stackoverflow.com/questions/2311845/is-it-possible-to-iterate-through-documents-stored-in-lucene-index */
            for (int i = 0; i < indexReader.maxDoc(); i++) {
                Document doc = indexReader.document(i);
                LOGGER.info("Doc " + i + ":");
                LOGGER.info(doc);
            }
        } else if (cmdLine.hasOption("dump")) {
            /* Dump indexed terms for a specific field.
             * With help from http://stackoverflow.com/questions/11148036/find-list-of-terms-indexed-by-lucene */
            Terms terms = SlowCompositeReaderWrapper.wrap(indexReader).terms(cmdLine.getOptionValue("dump"));
            LOGGER.info("Has positions: " + terms.hasPositions());
            LOGGER.info("Has offsets: " + terms.hasOffsets());
            LOGGER.info("Has freqs: " + terms.hasFreqs());
            LOGGER.info("Stats: " + terms.getStats());
            LOGGER.info(terms);
            TermsEnum termsEnum = terms.iterator();
            BytesRef br = null;
            while ((br = termsEnum.next()) != null) {
                LOGGER.info("  " + br.utf8ToString());
            }
        } else {
            IndexSearcher searcher = new IndexSearcher(indexReader);
            String field = cmdLine.getOptionValue("field");

            List<Pair<String, String>> queries = null;
            if (cmdLine.hasOption("query")) {
                queries = Collections.singletonList(Pair.of("", cmdLine.getOptionValue("query")));
            } else if (cmdLine.hasOption("list-file")) {
                if (!(cmdLine.hasOption("inchi-field") && cmdLine.hasOption("synonym-field"))) {
                    LOGGER.error("Must specify both inchi-field and synonym-field when using list-file.");
                    System.exit(1);
                }
                Integer inchiField = Integer.parseInt(cmdLine.getOptionValue("inchi-field"));
                Integer synonymField = Integer.parseInt(cmdLine.getOptionValue("synonym-field"));

                queries = new LinkedList<>();
                BufferedReader r = new BufferedReader(new FileReader(cmdLine.getOptionValue("list-file")));
                String line;
                while ((line = r.readLine()) != null) {
                    line = line.trim();
                    if (!line.isEmpty()) {
                        // TODO: use a proper TSV reader; this is intentionally terrible as is.
                        String[] fields = line.split("\t");
                        queries.add(Pair.of(fields[inchiField].replace("\"", ""), fields[synonymField]));
                    }
                }
                r.close();
            }

            if (queries == null || queries.size() == 0) {
                LOGGER.error("Found no queries to run.");
                return;
            }

            List<SearchResult> searchResults = new ArrayList<>(queries.size());
            for (Pair<String, String> queryPair : queries) {
                String inchi = queryPair.getLeft();
                String rawQueryString = queryPair.getRight();
                /* The Lucene query parser interprets the kind of structural annotations we see in chemical entities
                 * as query directives, which is not what we want at all.  Phrase queries seem to work adequately
                 * with the analyzer we're currently using. */
                String queryString = rawQueryString.trim().toLowerCase();
                String[] parts = queryString.split("\\s+");
                PhraseQuery query = new PhraseQuery();
                for (String p : parts) {
                    query.add(new Term(field, p));
                }
                LOGGER.info("Running query: " + query.toString());

                BooleanQuery bq = new BooleanQuery();
                bq.add(query, BooleanClause.Occur.MUST);
                bq.add(new TermQuery(new Term(field, "yeast")), BooleanClause.Occur.SHOULD);
                bq.add(new TermQuery(new Term(field, "ferment")), BooleanClause.Occur.SHOULD);
                bq.add(new TermQuery(new Term(field, "fermentation")), BooleanClause.Occur.SHOULD);
                bq.add(new TermQuery(new Term(field, "fermentive")), BooleanClause.Occur.SHOULD);
                bq.add(new TermQuery(new Term(field, "saccharomyces")), BooleanClause.Occur.SHOULD);
                LOGGER.info("  Full query: " + bq.toString());

                TopDocs topDocs = searcher.search(bq, 100);
                ScoreDoc[] scoreDocs = topDocs.scoreDocs;
                if (scoreDocs.length == 0) {
                    LOGGER.info("Search returned no results.");
                }
                List<ResultDocument> results = new ArrayList<>(scoreDocs.length);
                for (int i = 0; i < scoreDocs.length; i++) {
                    ScoreDoc scoreDoc = scoreDocs[i];
                    Document doc = indexReader.document(scoreDoc.doc);
                    LOGGER.info("Doc " + i + ": " + scoreDoc.doc + ", score " + scoreDoc.score + ": "
                            + doc.get("id") + ", " + doc.get("title"));
                    results.add(new ResultDocument(scoreDoc.doc, scoreDoc.score, doc.get("title"),
                            doc.get("id"), null));
                }
                LOGGER.info("----- Done with query " + query.toString());
                // TODO: reduce memory usage when not writing results to an output file.
                searchResults.add(new SearchResult(inchi, rawQueryString, bq, results));
            }

            if (cmdLine.hasOption("output")) {
                try (FileWriter writer = new FileWriter(cmdLine.getOptionValue("output"));) {
                    writer.write(objectMapper.writeValueAsString(searchResults));
                }
            }
        }
    }
}
From source file: nl.inl.blacklab.search.SearcherImpl.java
License: Apache License
@Override
public void getCharacterOffsets(int doc, String fieldName, int[] startsOfWords, int[] endsOfWords,
        boolean fillInDefaultsIfNotFound) {
    if (startsOfWords.length == 0)
        return; // nothing to do
    try {
        // Determine lowest and highest word position we'd like to know something about.
        // This saves a little bit of time for large result sets.
        int minP = -1, maxP = -1;
        int numStarts = startsOfWords.length;
        int numEnds = endsOfWords.length;
        for (int i = 0; i < numStarts; i++) {
            if (startsOfWords[i] < minP || minP == -1)
                minP = startsOfWords[i];
            if (startsOfWords[i] > maxP)
                maxP = startsOfWords[i];
        }
        for (int i = 0; i < numEnds; i++) {
            if (endsOfWords[i] < minP || minP == -1)
                minP = endsOfWords[i];
            if (endsOfWords[i] > maxP)
                maxP = endsOfWords[i];
        }
        if (minP < 0 || maxP < 0)
            throw new RuntimeException("Can't determine min and max positions");

        String fieldPropName = ComplexFieldUtil.mainPropertyOffsetsField(indexStructure, fieldName);

        org.apache.lucene.index.Terms terms = reader.getTermVector(doc, fieldPropName);
        if (terms == null)
            throw new IllegalArgumentException(
                    "Field " + fieldPropName + " in doc " + doc + " has no term vector");
        if (!terms.hasPositions())
            throw new IllegalArgumentException(
                    "Field " + fieldPropName + " in doc " + doc + " has no character position information");

        //int lowestPos = -1, highestPos = -1;
        int lowestPosFirstChar = -1, highestPosLastChar = -1;
        int total = numStarts + numEnds;
        boolean[] done = new boolean[total]; // NOTE: array is automatically initialized to zeroes!
        int found = 0;

        // Iterate over terms
        TermsEnum termsEnum = terms.iterator();
        while (termsEnum.next() != null) {
            PostingsEnum dpe = termsEnum.postings(null, null, PostingsEnum.POSITIONS);

            // Iterate over docs containing this term (NOTE: should be only one doc!)
            while (dpe.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                // Iterate over positions of this term in this doc
                int positionsRead = 0;
                int numberOfPositions = dpe.freq();
                while (positionsRead < numberOfPositions) {
                    int position = dpe.nextPosition();
                    if (position == -1)
                        break;
                    positionsRead++;

                    // Keep track of the lowest and highest char pos, so
                    // we can fill in the character positions we didn't find
                    int startOffset = dpe.startOffset();
                    if (startOffset < lowestPosFirstChar || lowestPosFirstChar == -1) {
                        lowestPosFirstChar = startOffset;
                    }
                    int endOffset = dpe.endOffset();
                    if (endOffset > highestPosLastChar) {
                        highestPosLastChar = endOffset;
                    }

                    // We've calculated the min and max word positions in advance, so
                    // we know we can skip this position if it's outside the range we're interested in.
                    // (Saves a little time for large result sets)
                    if (position < minP || position > maxP) {
                        continue;
                    }

                    for (int m = 0; m < numStarts; m++) {
                        if (!done[m] && position == startsOfWords[m]) {
                            done[m] = true;
                            startsOfWords[m] = startOffset;
                            found++;
                        }
                    }
                    for (int m = 0; m < numEnds; m++) {
                        if (!done[numStarts + m] && position == endsOfWords[m]) {
                            done[numStarts + m] = true;
                            endsOfWords[m] = endOffset;
                            found++;
                        }
                    }

                    // NOTE: we might be tempted to break here if found == total,
                    // but that would foul up our calculation of highestPosLastChar and
                    // lowestPosFirstChar.
                }
            }
        }
        if (found < total) {
            if (!fillInDefaultsIfNotFound)
                throw new RuntimeException("Could not find all character offsets!");

            if (lowestPosFirstChar < 0 || highestPosLastChar < 0)
                throw new RuntimeException("Could not find default char positions!");

            for (int m = 0; m < numStarts; m++) {
                if (!done[m])
                    startsOfWords[m] = lowestPosFirstChar;
            }
            for (int m = 0; m < numEnds; m++) {
                if (!done[numStarts + m])
                    endsOfWords[m] = highestPosLastChar;
            }
        }
    } catch (IOException e) {
        throw ExUtil.wrapRuntimeException(e);
    }
}
From source file: org.apache.blur.lucene.warmup.IndexWarmup.java
License: Apache License
public Map<String, List<IndexTracerResult>> sampleIndex(AtomicReader atomicReader, String context)
        throws IOException {
    Map<String, List<IndexTracerResult>> results = new HashMap<String, List<IndexTracerResult>>();
    if (atomicReader instanceof SegmentReader) {
        SegmentReader segmentReader = (SegmentReader) atomicReader;
        Directory directory = segmentReader.directory();
        if (!(directory instanceof TraceableDirectory)) {
            LOG.info("Context [{1}] cannot warmup directory [{0}] needs to be a TraceableDirectory.",
                    directory, context);
            return results;
        }
        IndexTracer tracer = new IndexTracer((TraceableDirectory) directory, _maxSampleSize);
        String fileName = getSampleFileName(segmentReader.getSegmentName());
        List<IndexTracerResult> segmentTraces = new ArrayList<IndexTracerResult>();
        if (directory.fileExists(fileName)) {
            IndexInput input = directory.openInput(fileName, IOContext.READONCE);
            segmentTraces = read(input);
            input.close();
        } else {
            Fields fields = atomicReader.fields();
            for (String field : fields) {
                LOG.debug("Context [{1}] sampling field [{0}].", field, context);
                Terms terms = fields.terms(field);
                boolean hasOffsets = terms.hasOffsets();
                boolean hasPayloads = terms.hasPayloads();
                boolean hasPositions = terms.hasPositions();
                tracer.initTrace(segmentReader, field, hasPositions, hasPayloads, hasOffsets);
                IndexTracerResult result = tracer.runTrace(terms);
                segmentTraces.add(result);
            }
            if (_isClosed.get()) {
                LOG.info("Context [{0}] index closed", context);
                return null;
            }
            IndexOutput output = directory.createOutput(fileName, IOContext.DEFAULT);
            write(segmentTraces, output);
            output.close();
        }
        results.put(segmentReader.getSegmentName(), segmentTraces);
    }
    return results;
}
From source file: org.elasticsearch.action.termvector.TermVectorResponse.java
License: Apache License
private void buildValues(XContentBuilder builder, Terms curTerms, int termFreq) throws IOException {
    if (!(curTerms.hasPayloads() || curTerms.hasOffsets() || curTerms.hasPositions())) {
        return;
    }

    builder.startArray(FieldStrings.TOKENS);
    for (int i = 0; i < termFreq; i++) {
        builder.startObject();
        if (curTerms.hasPositions()) {
            builder.field(FieldStrings.POS, curentPositions[i]);
        }
        if (curTerms.hasOffsets()) {
            builder.field(FieldStrings.START_OFFSET, currentStartOffset[i]);
            builder.field(FieldStrings.END_OFFSET, currentEndOffset[i]);
        }
        if (curTerms.hasPayloads() && (currentPayloads[i].length() > 0)) {
            builder.field(FieldStrings.PAYLOAD, currentPayloads[i]);
        }
        builder.endObject();
    }
    builder.endArray();
}
From source file: org.elasticsearch.action.termvector.TermVectorResponse.java
License: Apache License
private void initValues(Terms curTerms, DocsAndPositionsEnum posEnum, int termFreq) throws IOException {
    for (int j = 0; j < termFreq; j++) {
        int nextPos = posEnum.nextPosition();
        if (curTerms.hasPositions()) {
            curentPositions[j] = nextPos;
        }
        if (curTerms.hasOffsets()) {
            currentStartOffset[j] = posEnum.startOffset();
            currentEndOffset[j] = posEnum.endOffset();
        }
        if (curTerms.hasPayloads()) {
            BytesRef curPayload = posEnum.getPayload();
            if (curPayload != null) {
                currentPayloads[j] = new BytesArray(curPayload.bytes, 0, curPayload.length);
            } else {
                currentPayloads[j] = null;
            }
        }
    }
}
From source file: org.elasticsearch.action.termvector.TermVectorResponse.java
License: Apache License
private void initMemory(Terms curTerms, int termFreq) {
    // init memory for performance reasons
    if (curTerms.hasPositions()) {
        curentPositions = ArrayUtil.grow(curentPositions, termFreq);
    }
    if (curTerms.hasOffsets()) {
        currentStartOffset = ArrayUtil.grow(currentStartOffset, termFreq);
        currentEndOffset = ArrayUtil.grow(currentEndOffset, termFreq);
    }
    if (curTerms.hasPayloads()) {
        currentPayloads = new BytesArray[termFreq];
    }
}
From source file: org.elasticsearch.action.termvector.TermVectorWriter.java
License: Apache License
void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags,
        Fields topLevelFields) throws IOException {
    int numFieldsWritten = 0;
    TermsEnum iterator = null;
    DocsAndPositionsEnum docsAndPosEnum = null;
    DocsEnum docsEnum = null;
    TermsEnum topLevelIterator = null;
    for (String field : termVectorsByField) {
        if ((selectedFields != null) && (!selectedFields.contains(field))) {
            continue;
        }
        Terms fieldTermVector = termVectorsByField.terms(field);
        Terms topLevelTerms = topLevelFields.terms(field);

        topLevelIterator = topLevelTerms.iterator(topLevelIterator);
        boolean positions = flags.contains(Flag.Positions) && fieldTermVector.hasPositions();
        boolean offsets = flags.contains(Flag.Offsets) && fieldTermVector.hasOffsets();
        boolean payloads = flags.contains(Flag.Payloads) && fieldTermVector.hasPayloads();
        startField(field, fieldTermVector.size(), positions, offsets, payloads);

        if (flags.contains(Flag.FieldStatistics)) {
            writeFieldStatistics(topLevelTerms);
        }
        iterator = fieldTermVector.iterator(iterator);
        final boolean useDocsAndPos = positions || offsets || payloads;
        while (iterator.next() != null) { // iterate all terms of the current field
            // get the doc frequency
            BytesRef term = iterator.term();
            boolean foundTerm = topLevelIterator.seekExact(term);
            assert (foundTerm);
            startTerm(term);
            if (flags.contains(Flag.TermStatistics)) {
                writeTermStatistics(topLevelIterator);
            }
            if (useDocsAndPos) {
                // given we have pos or offsets
                docsAndPosEnum = writeTermWithDocsAndPos(iterator, docsAndPosEnum, positions, offsets,
                        payloads);
            } else {
                // if we do not have the positions stored, we need to
                // get the frequency from a DocsEnum.
                docsEnum = writeTermWithDocsOnly(iterator, docsEnum);
            }
        }
        numFieldsWritten++;
    }
    response.setTermVectorField(output);
    response.setHeader(writeHeader(numFieldsWritten, flags.contains(Flag.TermStatistics),
            flags.contains(Flag.FieldStatistics)));
}
From source file: org.elasticsearch.action.termvectors.TermVectorsResponse.java
License: Apache License
private void buildValues(XContentBuilder builder, Terms curTerms, int termFreq) throws IOException {
    if (!(curTerms.hasPayloads() || curTerms.hasOffsets() || curTerms.hasPositions())) {
        return;
    }

    builder.startArray(FieldStrings.TOKENS);
    for (int i = 0; i < termFreq; i++) {
        builder.startObject();
        if (curTerms.hasPositions()) {
            builder.field(FieldStrings.POS, currentPositions[i]);
        }
        if (curTerms.hasOffsets()) {
            builder.field(FieldStrings.START_OFFSET, currentStartOffset[i]);
            builder.field(FieldStrings.END_OFFSET, currentEndOffset[i]);
        }
        if (curTerms.hasPayloads() && (currentPayloads[i].length() > 0)) {
            builder.field(FieldStrings.PAYLOAD, currentPayloads[i]);
        }
        builder.endObject();
    }
    builder.endArray();
}
From source file: org.elasticsearch.action.termvectors.TermVectorsResponse.java
License: Apache License
private void initValues(Terms curTerms, PostingsEnum posEnum, int termFreq) throws IOException {
    for (int j = 0; j < termFreq; j++) {
        int nextPos = posEnum.nextPosition();
        if (curTerms.hasPositions()) {
            currentPositions[j] = nextPos;
        }
        if (curTerms.hasOffsets()) {
            currentStartOffset[j] = posEnum.startOffset();
            currentEndOffset[j] = posEnum.endOffset();
        }
        if (curTerms.hasPayloads()) {
            BytesRef curPayload = posEnum.getPayload();
            if (curPayload != null) {
                currentPayloads[j] = new BytesArray(curPayload.bytes, 0, curPayload.length);
            } else {
                currentPayloads[j] = null;
            }
        }
    }
}