Example usage for org.apache.lucene.index Terms hasPositions

Introduction

On this page you can find example usages of org.apache.lucene.index.Terms#hasPositions, collected from open source projects.

Prototype

public abstract boolean hasPositions();

Document

Returns true if documents in this field store positions.
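
Callers typically check hasPositions() (often together with its siblings hasOffsets(), hasPayloads(), and hasFreqs()) before requesting position data, since a field indexed without positions cannot supply it. The snippet below is a minimal sketch, not taken from the examples that follow; it assumes a Lucene 5.x-style API in which TermsEnum.postings(reuse, flags) returns a PostingsEnum (exact signatures vary between Lucene versions, as the examples below illustrate).

import java.io.IOException;

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

public class PositionsSketch {
    // Prints every position of every term in the given field of one leaf reader.
    static void dumpPositions(LeafReader reader, String field) throws IOException {
        Terms terms = reader.terms(field);
        if (terms == null || !terms.hasPositions()) {
            return; // field is absent, or was indexed without positions
        }
        TermsEnum termsEnum = terms.iterator();
        BytesRef term;
        while ((term = termsEnum.next()) != null) {
            // Request only the position data we just verified is present.
            PostingsEnum postings = termsEnum.postings(null, PostingsEnum.POSITIONS);
            while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                int freq = postings.freq();
                for (int i = 0; i < freq; i++) {
                    System.out.println(term.utf8ToString() + " @ " + postings.nextPosition());
                }
            }
        }
    }
}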

Usage

From source file: com.rocana.lucene.codec.v1.TestBlockPostingsFormat3.java

License: Apache License

public void assertTerms(Terms leftTerms, Terms rightTerms, boolean deep) throws Exception {
    if (leftTerms == null || rightTerms == null) {
        assertNull(leftTerms);
        assertNull(rightTerms);
        return;
    }
    assertTermsStatistics(leftTerms, rightTerms);

    // NOTE: we don't assert hasOffsets/hasPositions/hasPayloads because they are allowed to be different

    boolean bothHavePositions = leftTerms.hasPositions() && rightTerms.hasPositions();
    TermsEnum leftTermsEnum = leftTerms.iterator();
    TermsEnum rightTermsEnum = rightTerms.iterator();
    assertTermsEnum(leftTermsEnum, rightTermsEnum, true, bothHavePositions);

    assertTermsSeeking(leftTerms, rightTerms);

    if (deep) {
        int numIntersections = atLeast(3);
        for (int i = 0; i < numIntersections; i++) {
            String re = AutomatonTestUtil.randomRegexp(random());
            CompiledAutomaton automaton = new CompiledAutomaton(new RegExp(re, RegExp.NONE).toAutomaton());
            if (automaton.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
                // TODO: test start term too
                TermsEnum leftIntersection = leftTerms.intersect(automaton, null);
                TermsEnum rightIntersection = rightTerms.intersect(automaton, null);
                assertTermsEnum(leftIntersection, rightIntersection, rarely(), bothHavePositions);
            }
        }
    }
}

From source file: com.twentyn.patentSearch.DocumentSearch.java

License: Open Source License

public static void main(String[] args) throws Exception {
    System.out.println("Starting up...");
    System.out.flush();
    Options opts = new Options();
    opts.addOption(Option.builder("x").longOpt("index").hasArg().required().desc("Path to index file to read")
            .build());
    opts.addOption(Option.builder("h").longOpt("help").desc("Print this help message and exit").build());
    opts.addOption(Option.builder("v").longOpt("verbose").desc("Print verbose log output").build());

    opts.addOption(Option.builder("f").longOpt("field").hasArg().desc("The indexed field to search").build());
    opts.addOption(
            Option.builder("q").longOpt("query").hasArg().desc("The query to use when searching").build());
    opts.addOption(Option.builder("l").longOpt("list-file").hasArg()
            .desc("A file containing a list of queries to run in sequence").build());
    opts.addOption(
            Option.builder("e").longOpt("enumerate").desc("Enumerate the documents in the index").build());
    opts.addOption(Option.builder("d").longOpt("dump").hasArg()
            .desc("Dump terms in the document index for a specified field").build());
    opts.addOption(
            Option.builder("o").longOpt("output").hasArg().desc("Write results JSON to this file.").build());
    opts.addOption(Option.builder("n").longOpt("inchi-field").hasArg()
            .desc("The index of the InChI field if an input TSV is specified.").build());
    opts.addOption(Option.builder("s").longOpt("synonym-field").hasArg()
            .desc("The index of the chemical synonym field if an input TSV is specified.").build());

    HelpFormatter helpFormatter = new HelpFormatter();
    CommandLineParser cmdLineParser = new DefaultParser();
    CommandLine cmdLine = null;
    try {
        cmdLine = cmdLineParser.parse(opts, args);
    } catch (ParseException e) {
        System.out.println("Caught exception when parsing command line: " + e.getMessage());
        helpFormatter.printHelp("DocumentIndexer", opts);
        System.exit(1);
    }

    if (cmdLine.hasOption("help")) {
        helpFormatter.printHelp("DocumentIndexer", opts);
        System.exit(0);
    }

    if (!(cmdLine.hasOption("enumerate") || cmdLine.hasOption("dump") || (cmdLine.hasOption("field")
            && (cmdLine.hasOption("query") || cmdLine.hasOption("list-file"))))) {
        System.out.println("Must specify one of 'enumerate', 'dump', or 'field' + {'query', 'list-file'}");
        helpFormatter.printHelp("DocumentIndexer", opts);
        System.exit(1);
    }

    if (cmdLine.hasOption("verbose")) {
        // With help from http://stackoverflow.com/questions/23434252/programmatically-change-log-level-in-log4j2
        LoggerContext ctx = (LoggerContext) LogManager.getContext(false);
        Configuration ctxConfig = ctx.getConfiguration();
        LoggerConfig logConfig = ctxConfig.getLoggerConfig(LogManager.ROOT_LOGGER_NAME);
        logConfig.setLevel(Level.DEBUG);

        ctx.updateLoggers();
        LOGGER.debug("Verbose logging enabled");
    }

    ObjectMapper objectMapper = new ObjectMapper();
    objectMapper.enable(SerializationFeature.INDENT_OUTPUT);
    objectMapper.setVisibility(PropertyAccessor.ALL, JsonAutoDetect.Visibility.ANY);

    LOGGER.info("Opening index at " + cmdLine.getOptionValue("index"));

    try (Directory indexDir = FSDirectory.open(new File(cmdLine.getOptionValue("index")).toPath());
            IndexReader indexReader = DirectoryReader.open(indexDir)) {
        if (cmdLine.hasOption("enumerate")) {
            /* Enumerate all documents in the index.
             * With help from
             * http://stackoverflow.com/questions/2311845/is-it-possible-to-iterate-through-documents-stored-in-lucene-index
             */
            for (int i = 0; i < indexReader.maxDoc(); i++) {
                Document doc = indexReader.document(i);
                LOGGER.info("Doc " + i + ":");
                LOGGER.info(doc);
            }
        } else if (cmdLine.hasOption("dump")) {
            /* Dump indexed terms for a specific field.
             * With help from http://stackoverflow.com/questions/11148036/find-list-of-terms-indexed-by-lucene */
            Terms terms = SlowCompositeReaderWrapper.wrap(indexReader).terms(cmdLine.getOptionValue("dump"));
            LOGGER.info("Has positions: " + terms.hasPositions());
            LOGGER.info("Has offsets:   " + terms.hasOffsets());
            LOGGER.info("Has freqs:     " + terms.hasFreqs());
            LOGGER.info("Stats:         " + terms.getStats());
            LOGGER.info(terms);
            TermsEnum termsEnum = terms.iterator();
            BytesRef br = null;
            while ((br = termsEnum.next()) != null) {
                LOGGER.info("  " + br.utf8ToString());
            }

        } else {
            IndexSearcher searcher = new IndexSearcher(indexReader);
            String field = cmdLine.getOptionValue("field");

            List<Pair<String, String>> queries = null;
            if (cmdLine.hasOption("query")) {
                queries = Collections.singletonList(Pair.of("", cmdLine.getOptionValue("query")));
            } else if (cmdLine.hasOption("list-file")) {
                if (!(cmdLine.hasOption("inchi-field") && cmdLine.hasOption("synonym-field"))) {
                    LOGGER.error("Must specify both inchi-field and synonym-field when using list-file.");
                    System.exit(1);
                }
                Integer inchiField = Integer.parseInt(cmdLine.getOptionValue("inchi-field"));
                Integer synonymField = Integer.parseInt(cmdLine.getOptionValue("synonym-field"));

                queries = new LinkedList<>();
                BufferedReader r = new BufferedReader(new FileReader(cmdLine.getOptionValue("list-file")));
                String line;
                while ((line = r.readLine()) != null) {
                    line = line.trim();
                    if (!line.isEmpty()) {
                        // TODO: use a proper TSV reader; this is intentionally terrible as is.
                        String[] fields = line.split("\t");
                        queries.add(Pair.of(fields[inchiField].replace("\"", ""), fields[synonymField]));
                    }
                }
                r.close();
            }

            if (queries == null || queries.size() == 0) {
                LOGGER.error("Found no queries to run.");
                return;
            }

            List<SearchResult> searchResults = new ArrayList<>(queries.size());
            for (Pair<String, String> queryPair : queries) {
                String inchi = queryPair.getLeft();
                String rawQueryString = queryPair.getRight();
                /* The Lucene query parser interprets the kind of structural annotations we see in chemical entities
                 * as query directives, which is not what we want at all.  Phrase queries seem to work adequately
                 * with the analyzer we're currently using. */
                String queryString = rawQueryString.trim().toLowerCase();
                String[] parts = queryString.split("\\s+");
                PhraseQuery query = new PhraseQuery();
                for (String p : parts) {
                    query.add(new Term(field, p));
                }
                LOGGER.info("Running query: " + query.toString());

                BooleanQuery bq = new BooleanQuery();
                bq.add(query, BooleanClause.Occur.MUST);
                bq.add(new TermQuery(new Term(field, "yeast")), BooleanClause.Occur.SHOULD);
                bq.add(new TermQuery(new Term(field, "ferment")), BooleanClause.Occur.SHOULD);
                bq.add(new TermQuery(new Term(field, "fermentation")), BooleanClause.Occur.SHOULD);
                bq.add(new TermQuery(new Term(field, "fermentive")), BooleanClause.Occur.SHOULD);
                bq.add(new TermQuery(new Term(field, "saccharomyces")), BooleanClause.Occur.SHOULD);

                LOGGER.info("  Full query: " + bq.toString());

                TopDocs topDocs = searcher.search(bq, 100);
                ScoreDoc[] scoreDocs = topDocs.scoreDocs;
                if (scoreDocs.length == 0) {
                    LOGGER.info("Search returned no results.");
                }
                List<ResultDocument> results = new ArrayList<>(scoreDocs.length);
                for (int i = 0; i < scoreDocs.length; i++) {
                    ScoreDoc scoreDoc = scoreDocs[i];
                    Document doc = indexReader.document(scoreDoc.doc);
                    LOGGER.info("Doc " + i + ": " + scoreDoc.doc + ", score " + scoreDoc.score + ": "
                            + doc.get("id") + ", " + doc.get("title"));
                    results.add(new ResultDocument(scoreDoc.doc, scoreDoc.score, doc.get("title"),
                            doc.get("id"), null));
                }
                LOGGER.info("----- Done with query " + query.toString());
                // TODO: reduce memory usage when not writing results to an output file.
                searchResults.add(new SearchResult(inchi, rawQueryString, bq, results));
            }

            if (cmdLine.hasOption("output")) {
                try (FileWriter writer = new FileWriter(cmdLine.getOptionValue("output"))) {
                    writer.write(objectMapper.writeValueAsString(searchResults));
                }
            }
        }
    }
}

From source file: nl.inl.blacklab.search.SearcherImpl.java

License: Apache License

@Override
public void getCharacterOffsets(int doc, String fieldName, int[] startsOfWords, int[] endsOfWords,
        boolean fillInDefaultsIfNotFound) {

    if (startsOfWords.length == 0)
        return; // nothing to do
    try {
        // Determine lowest and highest word position we'd like to know something about.
        // This saves a little bit of time for large result sets.
        int minP = -1, maxP = -1;
        int numStarts = startsOfWords.length;
        int numEnds = endsOfWords.length;
        for (int i = 0; i < numStarts; i++) {
            if (startsOfWords[i] < minP || minP == -1)
                minP = startsOfWords[i];
            if (startsOfWords[i] > maxP)
                maxP = startsOfWords[i];
        }
        for (int i = 0; i < numEnds; i++) {
            if (endsOfWords[i] < minP || minP == -1)
                minP = endsOfWords[i];
            if (endsOfWords[i] > maxP)
                maxP = endsOfWords[i];
        }
        if (minP < 0 || maxP < 0)
            throw new RuntimeException("Can't determine min and max positions");

        String fieldPropName = ComplexFieldUtil.mainPropertyOffsetsField(indexStructure, fieldName);

        org.apache.lucene.index.Terms terms = reader.getTermVector(doc, fieldPropName);
        if (terms == null)
            throw new IllegalArgumentException(
                    "Field " + fieldPropName + " in doc " + doc + " has no term vector");
        if (!terms.hasPositions())
            throw new IllegalArgumentException(
                    "Field " + fieldPropName + " in doc " + doc + " has no character postion information");

        //int lowestPos = -1, highestPos = -1;
        int lowestPosFirstChar = -1, highestPosLastChar = -1;
        int total = numStarts + numEnds;
        boolean[] done = new boolean[total]; // NOTE: boolean array elements are automatically initialized to false!
        int found = 0;

        // Iterate over terms
        TermsEnum termsEnum = terms.iterator();
        while (termsEnum.next() != null) {
            PostingsEnum dpe = termsEnum.postings(null, null, PostingsEnum.POSITIONS);

            // Iterate over docs containing this term (NOTE: should be only one doc!)
            while (dpe.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                // Iterate over positions of this term in this doc
                int positionsRead = 0;
                int numberOfPositions = dpe.freq();
                while (positionsRead < numberOfPositions) {
                    int position = dpe.nextPosition();
                    if (position == -1)
                        break;
                    positionsRead++;

                    // Keep track of the lowest and highest char pos, so
                    // we can fill in the character positions we didn't find
                    int startOffset = dpe.startOffset();
                    if (startOffset < lowestPosFirstChar || lowestPosFirstChar == -1) {
                        lowestPosFirstChar = startOffset;
                    }
                    int endOffset = dpe.endOffset();
                    if (endOffset > highestPosLastChar) {
                        highestPosLastChar = endOffset;
                    }

                    // We've calculated the min and max word positions in advance, so
                    // we know we can skip this position if it's outside the range we're interested in.
                    // (Saves a little time for large result sets)
                    if (position < minP || position > maxP) {
                        continue;
                    }

                    for (int m = 0; m < numStarts; m++) {
                        if (!done[m] && position == startsOfWords[m]) {
                            done[m] = true;
                            startsOfWords[m] = startOffset;
                            found++;
                        }
                    }
                    for (int m = 0; m < numEnds; m++) {
                        if (!done[numStarts + m] && position == endsOfWords[m]) {
                            done[numStarts + m] = true;
                            endsOfWords[m] = endOffset;
                            found++;
                        }
                    }

                    // NOTE: we might be tempted to break here if found == total,
                    // but that would foul up our calculation of highestPosLastChar and
                    // lowestPosFirstChar.
                }
            }

        }
        if (found < total) {
            if (!fillInDefaultsIfNotFound)
                throw new RuntimeException("Could not find all character offsets!");

            if (lowestPosFirstChar < 0 || highestPosLastChar < 0)
                throw new RuntimeException("Could not find default char positions!");

            for (int m = 0; m < numStarts; m++) {
                if (!done[m])
                    startsOfWords[m] = lowestPosFirstChar;
            }
            for (int m = 0; m < numEnds; m++) {
                if (!done[numStarts + m])
                    endsOfWords[m] = highestPosLastChar;
            }
        }

    } catch (IOException e) {
        throw ExUtil.wrapRuntimeException(e);
    }
}

From source file: org.apache.blur.lucene.warmup.IndexWarmup.java

License: Apache License

public Map<String, List<IndexTracerResult>> sampleIndex(AtomicReader atomicReader, String context)
        throws IOException {
    Map<String, List<IndexTracerResult>> results = new HashMap<String, List<IndexTracerResult>>();
    if (atomicReader instanceof SegmentReader) {
        SegmentReader segmentReader = (SegmentReader) atomicReader;
        Directory directory = segmentReader.directory();
        if (!(directory instanceof TraceableDirectory)) {
            LOG.info("Context [{1}] cannot warmup directory [{0}] needs to be a TraceableDirectory.", directory,
                    context);/*w  w w.ja  va2s.  c o m*/
            return results;
        }
        IndexTracer tracer = new IndexTracer((TraceableDirectory) directory, _maxSampleSize);
        String fileName = getSampleFileName(segmentReader.getSegmentName());
        List<IndexTracerResult> segmentTraces = new ArrayList<IndexTracerResult>();
        if (directory.fileExists(fileName)) {
            IndexInput input = directory.openInput(fileName, IOContext.READONCE);
            segmentTraces = read(input);
            input.close();
        } else {
            Fields fields = atomicReader.fields();
            for (String field : fields) {
                LOG.debug("Context [{1}] sampling field [{0}].", field, context);
                Terms terms = fields.terms(field);
                boolean hasOffsets = terms.hasOffsets();
                boolean hasPayloads = terms.hasPayloads();
                boolean hasPositions = terms.hasPositions();

                tracer.initTrace(segmentReader, field, hasPositions, hasPayloads, hasOffsets);
                IndexTracerResult result = tracer.runTrace(terms);
                segmentTraces.add(result);
            }
            if (_isClosed.get()) {
                LOG.info("Context [{0}] index closed", context);
                return null;
            }
            IndexOutput output = directory.createOutput(fileName, IOContext.DEFAULT);
            write(segmentTraces, output);
            output.close();
        }
        results.put(segmentReader.getSegmentName(), segmentTraces);
    }
    return results;
}

From source file: org.elasticsearch.action.termvector.TermVectorResponse.java

License: Apache License

private void buildValues(XContentBuilder builder, Terms curTerms, int termFreq) throws IOException {
    if (!(curTerms.hasPayloads() || curTerms.hasOffsets() || curTerms.hasPositions())) {
        return;
    }

    builder.startArray(FieldStrings.TOKENS);
    for (int i = 0; i < termFreq; i++) {
        builder.startObject();
        if (curTerms.hasPositions()) {
            builder.field(FieldStrings.POS, curentPositions[i]);
        }
        if (curTerms.hasOffsets()) {
            builder.field(FieldStrings.START_OFFSET, currentStartOffset[i]);
            builder.field(FieldStrings.END_OFFSET, currentEndOffset[i]);
        }
        if (curTerms.hasPayloads() && (currentPayloads[i].length() > 0)) {
            builder.field(FieldStrings.PAYLOAD, currentPayloads[i]);
        }
        builder.endObject();
    }
    builder.endArray();

}

From source file: org.elasticsearch.action.termvector.TermVectorResponse.java

License: Apache License

private void initValues(Terms curTerms, DocsAndPositionsEnum posEnum, int termFreq) throws IOException {
    for (int j = 0; j < termFreq; j++) {
        int nextPos = posEnum.nextPosition();
        if (curTerms.hasPositions()) {
            curentPositions[j] = nextPos;
        }
        if (curTerms.hasOffsets()) {
            currentStartOffset[j] = posEnum.startOffset();
            currentEndOffset[j] = posEnum.endOffset();
        }
        if (curTerms.hasPayloads()) {
            BytesRef curPayload = posEnum.getPayload();
            if (curPayload != null) {
                currentPayloads[j] = new BytesArray(curPayload.bytes, 0, curPayload.length);
            } else {
                currentPayloads[j] = null;
            }

        }
    }
}

From source file: org.elasticsearch.action.termvector.TermVectorResponse.java

License: Apache License

private void initMemory(Terms curTerms, int termFreq) {
    // init memory for performance reasons
    if (curTerms.hasPositions()) {
        curentPositions = ArrayUtil.grow(curentPositions, termFreq);
    }
    if (curTerms.hasOffsets()) {
        currentStartOffset = ArrayUtil.grow(currentStartOffset, termFreq);
        currentEndOffset = ArrayUtil.grow(currentEndOffset, termFreq);
    }
    if (curTerms.hasPayloads()) {
        currentPayloads = new BytesArray[termFreq];
    }
}

From source file: org.elasticsearch.action.termvector.TermVectorWriter.java

License: Apache License

void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags,
        Fields topLevelFields) throws IOException {

    int numFieldsWritten = 0;
    TermsEnum iterator = null;
    DocsAndPositionsEnum docsAndPosEnum = null;
    DocsEnum docsEnum = null;
    TermsEnum topLevelIterator = null;
    for (String field : termVectorsByField) {
        if ((selectedFields != null) && (!selectedFields.contains(field))) {
            continue;
        }

        Terms fieldTermVector = termVectorsByField.terms(field);
        Terms topLevelTerms = topLevelFields.terms(field);

        topLevelIterator = topLevelTerms.iterator(topLevelIterator);
        boolean positions = flags.contains(Flag.Positions) && fieldTermVector.hasPositions();
        boolean offsets = flags.contains(Flag.Offsets) && fieldTermVector.hasOffsets();
        boolean payloads = flags.contains(Flag.Payloads) && fieldTermVector.hasPayloads();
        startField(field, fieldTermVector.size(), positions, offsets, payloads);
        if (flags.contains(Flag.FieldStatistics)) {
            writeFieldStatistics(topLevelTerms);
        }
        iterator = fieldTermVector.iterator(iterator);
        final boolean useDocsAndPos = positions || offsets || payloads;
        while (iterator.next() != null) { // iterate all terms of the current field
            // get the doc frequency
            BytesRef term = iterator.term();
            boolean foundTerm = topLevelIterator.seekExact(term);
            assert (foundTerm);
            startTerm(term);
            if (flags.contains(Flag.TermStatistics)) {
                writeTermStatistics(topLevelIterator);
            }
            if (useDocsAndPos) {
                // given we have pos or offsets
                docsAndPosEnum = writeTermWithDocsAndPos(iterator, docsAndPosEnum, positions, offsets,
                        payloads);
            } else {
                // if we do not have the positions stored, we need to
                // get the frequency from a DocsEnum.
                docsEnum = writeTermWithDocsOnly(iterator, docsEnum);
            }
        }
        numFieldsWritten++;
    }
    response.setTermVectorField(output);
    response.setHeader(writeHeader(numFieldsWritten, flags.contains(Flag.TermStatistics),
            flags.contains(Flag.FieldStatistics)));
}

From source file: org.elasticsearch.action.termvectors.TermVectorsResponse.java

License: Apache License

private void buildValues(XContentBuilder builder, Terms curTerms, int termFreq) throws IOException {
    if (!(curTerms.hasPayloads() || curTerms.hasOffsets() || curTerms.hasPositions())) {
        return;
    }

    builder.startArray(FieldStrings.TOKENS);
    for (int i = 0; i < termFreq; i++) {
        builder.startObject();
        if (curTerms.hasPositions()) {
            builder.field(FieldStrings.POS, currentPositions[i]);
        }
        if (curTerms.hasOffsets()) {
            builder.field(FieldStrings.START_OFFSET, currentStartOffset[i]);
            builder.field(FieldStrings.END_OFFSET, currentEndOffset[i]);
        }
        if (curTerms.hasPayloads() && (currentPayloads[i].length() > 0)) {
            builder.field(FieldStrings.PAYLOAD, currentPayloads[i]);
        }
        builder.endObject();
    }
    builder.endArray();
}

From source file: org.elasticsearch.action.termvectors.TermVectorsResponse.java

License: Apache License

private void initValues(Terms curTerms, PostingsEnum posEnum, int termFreq) throws IOException {
    for (int j = 0; j < termFreq; j++) {
        int nextPos = posEnum.nextPosition();
        if (curTerms.hasPositions()) {
            currentPositions[j] = nextPos;
        }
        if (curTerms.hasOffsets()) {
            currentStartOffset[j] = posEnum.startOffset();
            currentEndOffset[j] = posEnum.endOffset();
        }
        if (curTerms.hasPayloads()) {
            BytesRef curPayload = posEnum.getPayload();
            if (curPayload != null) {
                currentPayloads[j] = new BytesArray(curPayload.bytes, 0, curPayload.length);
            } else {
                currentPayloads[j] = null;
            }
        }
    }
}