Example usage for org.apache.lucene.index Terms hasOffsets

List of usage examples for org.apache.lucene.index Terms hasOffsets

Introduction

In this page you can find the example usage for org.apache.lucene.index Terms hasOffsets.

Prototype

public abstract boolean hasOffsets();

Source Link

Document

Returns true if documents in this field store offsets.

Usage

From source file:com.twentyn.patentSearch.DocumentSearch.java

License:Open Source License

public static void main(String[] args) throws Exception {
    System.out.println("Starting up...");
    System.out.flush();// w  ww. j av  a 2s  .  co m
    Options opts = new Options();
    opts.addOption(Option.builder("x").longOpt("index").hasArg().required().desc("Path to index file to read")
            .build());
    opts.addOption(Option.builder("h").longOpt("help").desc("Print this help message and exit").build());
    opts.addOption(Option.builder("v").longOpt("verbose").desc("Print verbose log output").build());

    opts.addOption(Option.builder("f").longOpt("field").hasArg().desc("The indexed field to search").build());
    opts.addOption(
            Option.builder("q").longOpt("query").hasArg().desc("The query to use when searching").build());
    opts.addOption(Option.builder("l").longOpt("list-file").hasArg()
            .desc("A file containing a list of queries to run in sequence").build());
    opts.addOption(
            Option.builder("e").longOpt("enumerate").desc("Enumerate the documents in the index").build());
    opts.addOption(Option.builder("d").longOpt("dump").hasArg()
            .desc("Dump terms in the document index for a specified field").build());
    opts.addOption(
            Option.builder("o").longOpt("output").hasArg().desc("Write results JSON to this file.").build());
    opts.addOption(Option.builder("n").longOpt("inchi-field").hasArg()
            .desc("The index of the InChI field if an input TSV is specified.").build());
    opts.addOption(Option.builder("s").longOpt("synonym-field").hasArg()
            .desc("The index of the chemical synonym field if an input TSV is specified.").build());

    HelpFormatter helpFormatter = new HelpFormatter();
    CommandLineParser cmdLineParser = new DefaultParser();
    CommandLine cmdLine = null;
    try {
        cmdLine = cmdLineParser.parse(opts, args);
    } catch (ParseException e) {
        System.out.println("Caught exception when parsing command line: " + e.getMessage());
        helpFormatter.printHelp("DocumentIndexer", opts);
        System.exit(1);
    }

    if (cmdLine.hasOption("help")) {
        helpFormatter.printHelp("DocumentIndexer", opts);
        System.exit(0);
    }

    if (!(cmdLine.hasOption("enumerate") || cmdLine.hasOption("dump") || (cmdLine.hasOption("field")
            && (cmdLine.hasOption("query") || cmdLine.hasOption("list-file"))))) {
        System.out.println("Must specify one of 'enumerate', 'dump', or 'field' + {'query', 'list-file'}");
        helpFormatter.printHelp("DocumentIndexer", opts);
        System.exit(1);
    }

    if (cmdLine.hasOption("verbose")) {
        // With help from http://stackoverflow.com/questions/23434252/programmatically-change-log-level-in-log4j2
        LoggerContext ctx = (LoggerContext) LogManager.getContext(false);
        Configuration ctxConfig = ctx.getConfiguration();
        LoggerConfig logConfig = ctxConfig.getLoggerConfig(LogManager.ROOT_LOGGER_NAME);
        logConfig.setLevel(Level.DEBUG);

        ctx.updateLoggers();
        LOGGER.debug("Verbose logging enabled");
    }

    ObjectMapper objectMapper = new ObjectMapper();
    objectMapper.enable(SerializationFeature.INDENT_OUTPUT);
    objectMapper.setVisibility(PropertyAccessor.ALL, JsonAutoDetect.Visibility.ANY);

    LOGGER.info("Opening index at " + cmdLine.getOptionValue("index"));

    try (Directory indexDir = FSDirectory.open(new File(cmdLine.getOptionValue("index")).toPath());
            IndexReader indexReader = DirectoryReader.open(indexDir);) {
        if (cmdLine.hasOption("enumerate")) {
            /* Enumerate all documents in the index.
             * With help from
             * http://stackoverflow.com/questions/2311845/is-it-possible-to-iterate-through-documents-stored-in-lucene-index
             */
            for (int i = 0; i < indexReader.maxDoc(); i++) {
                Document doc = indexReader.document(i);
                LOGGER.info("Doc " + i + ":");
                LOGGER.info(doc);
            }
        } else if (cmdLine.hasOption("dump")) {
            /* Dump indexed terms for a specific field.
             * With help from http://stackoverflow.com/questions/11148036/find-list-of-terms-indexed-by-lucene */
            Terms terms = SlowCompositeReaderWrapper.wrap(indexReader).terms(cmdLine.getOptionValue("dump"));
            LOGGER.info("Has positions: " + terms.hasPositions());
            LOGGER.info("Has offsets:   " + terms.hasOffsets());
            LOGGER.info("Has freqs:     " + terms.hasFreqs());
            LOGGER.info("Stats:         " + terms.getStats());
            LOGGER.info(terms);
            TermsEnum termsEnum = terms.iterator();
            BytesRef br = null;
            while ((br = termsEnum.next()) != null) {
                LOGGER.info("  " + br.utf8ToString());
            }

        } else {
            IndexSearcher searcher = new IndexSearcher(indexReader);
            String field = cmdLine.getOptionValue("field");

            List<Pair<String, String>> queries = null;
            if (cmdLine.hasOption("query")) {
                queries = Collections.singletonList(Pair.of("", cmdLine.getOptionValue("query")));
            } else if (cmdLine.hasOption("list-file")) {
                if (!(cmdLine.hasOption("inchi-field") && cmdLine.hasOption("synonym-field"))) {
                    LOGGER.error("Must specify both inchi-field and synonym-field when using list-file.");
                    System.exit(1);
                }
                Integer inchiField = Integer.parseInt(cmdLine.getOptionValue("inchi-field"));
                Integer synonymField = Integer.parseInt(cmdLine.getOptionValue("synonym-field"));

                queries = new LinkedList<>();
                BufferedReader r = new BufferedReader(new FileReader(cmdLine.getOptionValue("list-file")));
                String line;
                while ((line = r.readLine()) != null) {
                    line = line.trim();
                    if (!line.isEmpty()) {
                        // TODO: use a proper TSV reader; this is intentionally terrible as is.
                        String[] fields = line.split("\t");
                        queries.add(Pair.of(fields[inchiField].replace("\"", ""), fields[synonymField]));
                    }
                }
                r.close();
            }

            if (queries == null || queries.size() == 0) {
                LOGGER.error("Found no queries to run.");
                return;
            }

            List<SearchResult> searchResults = new ArrayList<>(queries.size());
            for (Pair<String, String> queryPair : queries) {
                String inchi = queryPair.getLeft();
                String rawQueryString = queryPair.getRight();
                /* The Lucene query parser interprets the kind of structural annotations we see in chemical entities
                 * as query directives, which is not what we want at all.  Phrase queries seem to work adequately
                 * with the analyzer we're currently using. */
                String queryString = rawQueryString.trim().toLowerCase();
                String[] parts = queryString.split("\\s+");
                PhraseQuery query = new PhraseQuery();
                for (String p : parts) {
                    query.add(new Term(field, p));
                }
                LOGGER.info("Running query: " + query.toString());

                BooleanQuery bq = new BooleanQuery();
                bq.add(query, BooleanClause.Occur.MUST);
                bq.add(new TermQuery(new Term(field, "yeast")), BooleanClause.Occur.SHOULD);
                bq.add(new TermQuery(new Term(field, "ferment")), BooleanClause.Occur.SHOULD);
                bq.add(new TermQuery(new Term(field, "fermentation")), BooleanClause.Occur.SHOULD);
                bq.add(new TermQuery(new Term(field, "fermentive")), BooleanClause.Occur.SHOULD);
                bq.add(new TermQuery(new Term(field, "saccharomyces")), BooleanClause.Occur.SHOULD);

                LOGGER.info("  Full query: " + bq.toString());

                TopDocs topDocs = searcher.search(bq, 100);
                ScoreDoc[] scoreDocs = topDocs.scoreDocs;
                if (scoreDocs.length == 0) {
                    LOGGER.info("Search returned no results.");
                }
                List<ResultDocument> results = new ArrayList<>(scoreDocs.length);
                for (int i = 0; i < scoreDocs.length; i++) {
                    ScoreDoc scoreDoc = scoreDocs[i];
                    Document doc = indexReader.document(scoreDoc.doc);
                    LOGGER.info("Doc " + i + ": " + scoreDoc.doc + ", score " + scoreDoc.score + ": "
                            + doc.get("id") + ", " + doc.get("title"));
                    results.add(new ResultDocument(scoreDoc.doc, scoreDoc.score, doc.get("title"),
                            doc.get("id"), null));
                }
                LOGGER.info("----- Done with query " + query.toString());
                // TODO: reduce memory usage when not writing results to an output file.
                searchResults.add(new SearchResult(inchi, rawQueryString, bq, results));
            }

            if (cmdLine.hasOption("output")) {
                try (FileWriter writer = new FileWriter(cmdLine.getOptionValue("output"));) {
                    writer.write(objectMapper.writeValueAsString(searchResults));
                }
            }
        }
    }
}

From source file:org.apache.blur.lucene.warmup.IndexWarmup.java

License:Apache License

public Map<String, List<IndexTracerResult>> sampleIndex(AtomicReader atomicReader, String context)
        throws IOException {
    Map<String, List<IndexTracerResult>> results = new HashMap<String, List<IndexTracerResult>>();
    if (atomicReader instanceof SegmentReader) {
        SegmentReader segmentReader = (SegmentReader) atomicReader;
        Directory directory = segmentReader.directory();
        if (!(directory instanceof TraceableDirectory)) {
            LOG.info("Context [{1}] cannot warmup directory [{0}] needs to be a TraceableDirectory.", directory,
                    context);/*from   w ww .j  a va2  s.  com*/
            return results;
        }
        IndexTracer tracer = new IndexTracer((TraceableDirectory) directory, _maxSampleSize);
        String fileName = getSampleFileName(segmentReader.getSegmentName());
        List<IndexTracerResult> segmentTraces = new ArrayList<IndexTracerResult>();
        if (directory.fileExists(fileName)) {
            IndexInput input = directory.openInput(fileName, IOContext.READONCE);
            segmentTraces = read(input);
            input.close();
        } else {
            Fields fields = atomicReader.fields();
            for (String field : fields) {
                LOG.debug("Context [{1}] sampling field [{0}].", field, context);
                Terms terms = fields.terms(field);
                boolean hasOffsets = terms.hasOffsets();
                boolean hasPayloads = terms.hasPayloads();
                boolean hasPositions = terms.hasPositions();

                tracer.initTrace(segmentReader, field, hasPositions, hasPayloads, hasOffsets);
                IndexTracerResult result = tracer.runTrace(terms);
                segmentTraces.add(result);
            }
            if (_isClosed.get()) {
                LOG.info("Context [{0}] index closed", context);
                return null;
            }
            IndexOutput output = directory.createOutput(fileName, IOContext.DEFAULT);
            write(segmentTraces, output);
            output.close();
        }
        results.put(segmentReader.getSegmentName(), segmentTraces);
    }
    return results;
}

From source file:org.elasticsearch.action.termvector.TermVectorResponse.java

License:Apache License

private void buildValues(XContentBuilder builder, Terms curTerms, int termFreq) throws IOException {
    if (!(curTerms.hasPayloads() || curTerms.hasOffsets() || curTerms.hasPositions())) {
        return;/*  w w  w  .  ja v a  2s.co  m*/
    }

    builder.startArray(FieldStrings.TOKENS);
    for (int i = 0; i < termFreq; i++) {
        builder.startObject();
        if (curTerms.hasPositions()) {
            builder.field(FieldStrings.POS, curentPositions[i]);
        }
        if (curTerms.hasOffsets()) {
            builder.field(FieldStrings.START_OFFSET, currentStartOffset[i]);
            builder.field(FieldStrings.END_OFFSET, currentEndOffset[i]);
        }
        if (curTerms.hasPayloads() && (currentPayloads[i].length() > 0)) {
            builder.field(FieldStrings.PAYLOAD, currentPayloads[i]);
        }
        builder.endObject();
    }
    builder.endArray();

}

From source file:org.elasticsearch.action.termvector.TermVectorResponse.java

License:Apache License

private void initValues(Terms curTerms, DocsAndPositionsEnum posEnum, int termFreq) throws IOException {
    for (int j = 0; j < termFreq; j++) {
        int nextPos = posEnum.nextPosition();
        if (curTerms.hasPositions()) {
            curentPositions[j] = nextPos;
        }/*  www.  j  a  v a 2  s  .  c o  m*/
        if (curTerms.hasOffsets()) {
            currentStartOffset[j] = posEnum.startOffset();
            currentEndOffset[j] = posEnum.endOffset();
        }
        if (curTerms.hasPayloads()) {
            BytesRef curPayload = posEnum.getPayload();
            if (curPayload != null) {
                currentPayloads[j] = new BytesArray(curPayload.bytes, 0, curPayload.length);
            } else {
                currentPayloads[j] = null;
            }

        }
    }
}

From source file:org.elasticsearch.action.termvector.TermVectorResponse.java

License:Apache License

private void initMemory(Terms curTerms, int termFreq) {
    // init memory for performance reasons
    if (curTerms.hasPositions()) {
        curentPositions = ArrayUtil.grow(curentPositions, termFreq);
    }// ww w  .  j a v a  2s . c o m
    if (curTerms.hasOffsets()) {
        currentStartOffset = ArrayUtil.grow(currentStartOffset, termFreq);
        currentEndOffset = ArrayUtil.grow(currentEndOffset, termFreq);
    }
    if (curTerms.hasPayloads()) {
        currentPayloads = new BytesArray[termFreq];
    }
}

From source file:org.elasticsearch.action.termvector.TermVectorWriter.java

License:Apache License

void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags,
        Fields topLevelFields) throws IOException {

    int numFieldsWritten = 0;
    TermsEnum iterator = null;//from   w w  w  .  j a v a2  s .  c o m
    DocsAndPositionsEnum docsAndPosEnum = null;
    DocsEnum docsEnum = null;
    TermsEnum topLevelIterator = null;
    for (String field : termVectorsByField) {
        if ((selectedFields != null) && (!selectedFields.contains(field))) {
            continue;
        }

        Terms fieldTermVector = termVectorsByField.terms(field);
        Terms topLevelTerms = topLevelFields.terms(field);

        topLevelIterator = topLevelTerms.iterator(topLevelIterator);
        boolean positions = flags.contains(Flag.Positions) && fieldTermVector.hasPositions();
        boolean offsets = flags.contains(Flag.Offsets) && fieldTermVector.hasOffsets();
        boolean payloads = flags.contains(Flag.Payloads) && fieldTermVector.hasPayloads();
        startField(field, fieldTermVector.size(), positions, offsets, payloads);
        if (flags.contains(Flag.FieldStatistics)) {
            writeFieldStatistics(topLevelTerms);
        }
        iterator = fieldTermVector.iterator(iterator);
        final boolean useDocsAndPos = positions || offsets || payloads;
        while (iterator.next() != null) { // iterate all terms of the
            // current field
            // get the doc frequency
            BytesRef term = iterator.term();
            boolean foundTerm = topLevelIterator.seekExact(term);
            assert (foundTerm);
            startTerm(term);
            if (flags.contains(Flag.TermStatistics)) {
                writeTermStatistics(topLevelIterator);
            }
            if (useDocsAndPos) {
                // given we have pos or offsets
                docsAndPosEnum = writeTermWithDocsAndPos(iterator, docsAndPosEnum, positions, offsets,
                        payloads);
            } else {
                // if we do not have the positions stored, we need to
                // get the frequency from a DocsEnum.
                docsEnum = writeTermWithDocsOnly(iterator, docsEnum);
            }
        }
        numFieldsWritten++;
    }
    response.setTermVectorField(output);
    response.setHeader(writeHeader(numFieldsWritten, flags.contains(Flag.TermStatistics),
            flags.contains(Flag.FieldStatistics)));
}

From source file:org.elasticsearch.action.termvectors.TermVectorsResponse.java

License:Apache License

private void buildValues(XContentBuilder builder, Terms curTerms, int termFreq) throws IOException {
    if (!(curTerms.hasPayloads() || curTerms.hasOffsets() || curTerms.hasPositions())) {
        return;/* ww w. ja  v a 2s  . com*/
    }

    builder.startArray(FieldStrings.TOKENS);
    for (int i = 0; i < termFreq; i++) {
        builder.startObject();
        if (curTerms.hasPositions()) {
            builder.field(FieldStrings.POS, currentPositions[i]);
        }
        if (curTerms.hasOffsets()) {
            builder.field(FieldStrings.START_OFFSET, currentStartOffset[i]);
            builder.field(FieldStrings.END_OFFSET, currentEndOffset[i]);
        }
        if (curTerms.hasPayloads() && (currentPayloads[i].length() > 0)) {
            builder.field(FieldStrings.PAYLOAD, currentPayloads[i]);
        }
        builder.endObject();
    }
    builder.endArray();
}

From source file:org.elasticsearch.action.termvectors.TermVectorsResponse.java

License:Apache License

private void initValues(Terms curTerms, PostingsEnum posEnum, int termFreq) throws IOException {
    for (int j = 0; j < termFreq; j++) {
        int nextPos = posEnum.nextPosition();
        if (curTerms.hasPositions()) {
            currentPositions[j] = nextPos;
        }/*from  w  w  w.j av  a2s.com*/
        if (curTerms.hasOffsets()) {
            currentStartOffset[j] = posEnum.startOffset();
            currentEndOffset[j] = posEnum.endOffset();
        }
        if (curTerms.hasPayloads()) {
            BytesRef curPayload = posEnum.getPayload();
            if (curPayload != null) {
                currentPayloads[j] = new BytesArray(curPayload.bytes, 0, curPayload.length);
            } else {
                currentPayloads[j] = null;
            }
        }
    }
}

From source file:org.elasticsearch.action.termvectors.TermVectorsResponse.java

License:Apache License

private void initMemory(Terms curTerms, int termFreq) {
    // init memory for performance reasons
    if (curTerms.hasPositions()) {
        currentPositions = ArrayUtil.grow(currentPositions, termFreq);
    }/*  ww  w  .  j a v  a2s .  co  m*/
    if (curTerms.hasOffsets()) {
        currentStartOffset = ArrayUtil.grow(currentStartOffset, termFreq);
        currentEndOffset = ArrayUtil.grow(currentEndOffset, termFreq);
    }
    if (curTerms.hasPayloads()) {
        currentPayloads = new BytesArray[termFreq];
    }
}

From source file:org.elasticsearch.action.termvectors.TermVectorsWriter.java

License:Apache License

void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags,
        Fields topLevelFields, @Nullable AggregatedDfs dfs, @Nullable TermVectorsFilter termVectorsFilter)
        throws IOException {
    int numFieldsWritten = 0;
    PostingsEnum docsAndPosEnum = null;//w w  w  . j a va  2s. co  m
    PostingsEnum docsEnum = null;
    boolean hasScores = termVectorsFilter != null;

    for (String field : termVectorsByField) {
        if ((selectedFields != null) && (!selectedFields.contains(field))) {
            continue;
        }

        Terms fieldTermVector = termVectorsByField.terms(field);
        Terms topLevelTerms = topLevelFields.terms(field);

        // if no terms found, take the retrieved term vector fields for stats
        if (topLevelTerms == null) {
            topLevelTerms = fieldTermVector;
        }

        TermsEnum topLevelIterator = topLevelTerms.iterator();
        boolean positions = flags.contains(Flag.Positions) && fieldTermVector.hasPositions();
        boolean offsets = flags.contains(Flag.Offsets) && fieldTermVector.hasOffsets();
        boolean payloads = flags.contains(Flag.Payloads) && fieldTermVector.hasPayloads();

        long termsSize = fieldTermVector.size();
        if (hasScores) {
            termsSize = Math.min(termsSize, termVectorsFilter.size(field));
        }
        startField(field, termsSize, positions, offsets, payloads);

        if (flags.contains(Flag.FieldStatistics)) {
            if (dfs != null) {
                writeFieldStatistics(dfs.fieldStatistics().get(field));
            } else {
                writeFieldStatistics(topLevelTerms);
            }
        }
        TermsEnum iterator = fieldTermVector.iterator();
        final boolean useDocsAndPos = positions || offsets || payloads;
        while (iterator.next() != null) { // iterate all terms of the current field
            BytesRef termBytesRef = iterator.term();
            Term term = new Term(field, termBytesRef);

            // with filtering we only keep the best terms
            if (hasScores && !termVectorsFilter.hasScoreTerm(term)) {
                continue;
            }

            startTerm(termBytesRef);
            if (flags.contains(Flag.TermStatistics)) {
                // get the doc frequency
                if (dfs != null) {
                    final TermStatistics statistics = dfs.termStatistics().get(term);
                    writeTermStatistics(
                            statistics == null ? new TermStatistics(termBytesRef, 0, 0) : statistics);
                } else {
                    boolean foundTerm = topLevelIterator.seekExact(termBytesRef);
                    if (foundTerm) {
                        writeTermStatistics(topLevelIterator);
                    } else {
                        writeTermStatistics(new TermStatistics(termBytesRef, 0, 0));
                    }
                }
            }
            if (useDocsAndPos) {
                // given we have pos or offsets
                docsAndPosEnum = writeTermWithDocsAndPos(iterator, docsAndPosEnum, positions, offsets,
                        payloads);
            } else {
                // if we do not have the positions stored, we need to
                // get the frequency from a PostingsEnum.
                docsEnum = writeTermWithDocsOnly(iterator, docsEnum);
            }
            if (hasScores) {
                writeScoreTerm(termVectorsFilter.getScoreTerm(term));
            }
        }
        numFieldsWritten++;
    }
    response.setTermVectorsField(output);
    response.setHeader(writeHeader(numFieldsWritten, flags.contains(Flag.TermStatistics),
            flags.contains(Flag.FieldStatistics), hasScores));
}