Example usage for org.apache.lucene.index IndexOptions DOCS_AND_FREQS

List of usage examples for org.apache.lucene.index IndexOptions DOCS_AND_FREQS

Introduction

In this page you can find the example usage for org.apache.lucene.index IndexOptions DOCS_AND_FREQS.

Prototype

IndexOptions DOCS_AND_FREQS

To view the source code for org.apache.lucene.index IndexOptions DOCS_AND_FREQS, click the Source Link below.

Click Source Link

Document

Only documents and term frequencies are indexed: positions are omitted.

Usage

From source file:com.lucure.core.codec.LucurePostingsWriter.java

License:Apache License

/**
 * Configures per-field state from the field's index options and reports how
 * many file pointers must be tracked per term for this field.
 *
 * @param fieldInfo metadata for the field being written
 * @return 1 (doc FP only), 2 (doc + pos FP), or 3 (doc + pos + pay FP)
 */
@Override
public int setField(FieldInfo fieldInfo) {
    final IndexOptions options = fieldInfo.getIndexOptions();
    // IndexOptions is ordered NONE < DOCS < DOCS_AND_FREQS < ...POSITIONS
    // < ...OFFSETS, so compareTo tests "at least this much is indexed".
    fieldHasFreqs = options.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
    fieldHasPositions = options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
    fieldHasOffsets = options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
    fieldHasPayloads = fieldInfo.hasPayloads();
    skipWriter.setField(fieldHasPositions, fieldHasOffsets, fieldHasPayloads);
    lastState = emptyState;

    if (!fieldHasPositions) {
        return 1; // doc FP only
    }
    // With positions we track the pos file, and additionally the pay file
    // when payloads or offsets are present.
    return (fieldHasPayloads || fieldHasOffsets) ? 3 : 2;
}

From source file:com.rocana.lucene.codec.v1.RocanaFieldReader.java

License:Apache License

/**
 * @return {@code true} if this field indexes term frequencies, i.e. its
 *         index options are DOCS_AND_FREQS or richer.
 */
@Override
public boolean hasFreqs() {
    final IndexOptions options = fieldInfo.getIndexOptions();
    return options.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
}

From source file:com.rocana.lucene.codec.v1.TestBlockPostingsFormat3.java

License:Apache License

public void test() throws Exception {
    Directory dir = newDirectory();
    // Analyzer that attaches payloads based on the field name: fixed-length
    // payloads for *payloadsFixed* fields, variable-length for
    // *payloadsVariable* fields, and a plain token stream otherwise.
    Analyzer analyzer = new Analyzer(Analyzer.PER_FIELD_REUSE_STRATEGY) {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer();
            if (fieldName.contains("payloadsFixed")) {
                TokenFilter filter = new MockFixedLengthPayloadFilter(new Random(0), tokenizer, 1);
                return new TokenStreamComponents(tokenizer, filter);
            } else if (fieldName.contains("payloadsVariable")) {
                TokenFilter filter = new MockVariableLengthPayloadFilter(new Random(0), tokenizer);
                return new TokenStreamComponents(tokenizer, filter);
            } else {
                return new TokenStreamComponents(tokenizer);
            }
        }
    };
    IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
    iwc.setCodec(TestUtil.alwaysPostingsFormat(new RocanaLucene50PostingsFormat()));
    // TODO we could actually add more fields implemented with different PFs
    // or, just put this test into the usual rotation?
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
    Document doc = new Document();
    // One FieldType per IndexOptions level; each also stores term vectors
    // so postings can be cross-checked against the vectors.
    FieldType docsOnlyType = new FieldType(TextField.TYPE_NOT_STORED);
    // turn this on for a cross-check
    docsOnlyType.setStoreTermVectors(true);
    docsOnlyType.setIndexOptions(IndexOptions.DOCS);

    FieldType docsAndFreqsType = new FieldType(TextField.TYPE_NOT_STORED);
    // turn this on for a cross-check
    docsAndFreqsType.setStoreTermVectors(true);
    docsAndFreqsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);

    FieldType positionsType = new FieldType(TextField.TYPE_NOT_STORED);
    // turn these on for a cross-check
    positionsType.setStoreTermVectors(true);
    positionsType.setStoreTermVectorPositions(true);
    positionsType.setStoreTermVectorOffsets(true);
    positionsType.setStoreTermVectorPayloads(true);
    FieldType offsetsType = new FieldType(positionsType);
    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    // Field names encode the configuration; the analyzer above keys payload
    // behavior off the "payloadsFixed"/"payloadsVariable" substrings.
    Field field1 = new Field("field1docs", "", docsOnlyType);
    Field field2 = new Field("field2freqs", "", docsAndFreqsType);
    Field field3 = new Field("field3positions", "", positionsType);
    Field field4 = new Field("field4offsets", "", offsetsType);
    Field field5 = new Field("field5payloadsFixed", "", positionsType);
    Field field6 = new Field("field6payloadsVariable", "", positionsType);
    Field field7 = new Field("field7payloadsFixedOffsets", "", offsetsType);
    Field field8 = new Field("field8payloadsVariableOffsets", "", offsetsType);
    doc.add(field1);
    doc.add(field2);
    doc.add(field3);
    doc.add(field4);
    doc.add(field5);
    doc.add(field6);
    doc.add(field7);
    doc.add(field8);
    // Every field gets the same text so the differently-configured postings
    // can be compared against each other after indexing.
    for (int i = 0; i < MAXDOC; i++) {
        String stringValue = Integer.toString(i) + " verycommon " + English.intToEnglish(i).replace('-', ' ')
                + " " + TestUtil.randomSimpleString(random());
        field1.setStringValue(stringValue);
        field2.setStringValue(stringValue);
        field3.setStringValue(stringValue);
        field4.setStringValue(stringValue);
        field5.setStringValue(stringValue);
        field6.setStringValue(stringValue);
        field7.setStringValue(stringValue);
        field8.setStringValue(stringValue);
        iw.addDocument(doc);
    }
    iw.close();
    verify(dir);
    TestUtil.checkIndex(dir); // for some extra coverage, checkIndex before we forceMerge
    // Re-open in APPEND mode, force-merge to a single segment, and verify
    // again so the merged codepath is exercised too.
    iwc = newIndexWriterConfig(analyzer);
    iwc.setCodec(TestUtil.alwaysPostingsFormat(new RocanaLucene50PostingsFormat()));
    iwc.setOpenMode(OpenMode.APPEND);
    IndexWriter iw2 = new IndexWriter(dir, iwc);
    iw2.forceMerge(1);
    iw2.close();
    verify(dir);
    dir.close();
}

From source file:com.vmware.xenon.services.common.Lucene60FieldInfosFormatWithCache.java

License:Open Source License

/**
 * Decodes the on-disk byte representation of a field's index options.
 *
 * @param input the input being read, used for exception context
 * @param b     the encoded byte (0..4)
 * @return the corresponding {@link IndexOptions} value
 * @throws IOException if {@code b} is outside the valid range, signalling
 *                     index corruption
 */
private static IndexOptions getIndexOptions(IndexInput input, byte b) throws IOException {
    // The encoded byte maps positionally onto the enum values, from NONE (0)
    // up to DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS (4).
    final IndexOptions[] byCode = { IndexOptions.NONE, IndexOptions.DOCS, IndexOptions.DOCS_AND_FREQS,
            IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS };
    if (b < 0 || b >= byCode.length) {
        // Any other value means the file is corrupt.
        throw new CorruptIndexException("invalid IndexOptions byte: " + b, input);
    }
    return byCode[b];
}

From source file:io.anserini.index.generator.LuceneDocumentGenerator.java

License:Apache License

/**
 * Builds a Lucene {@link Document} from a source document, or returns
 * {@code null} (and bumps the matching counter) when the document should be
 * skipped: text extraction failed or the content is empty.
 *
 * @param src the source document to convert
 * @return the indexable document, or {@code null} if skipped
 */
public Document createDocument(SourceDocument src) {
    final String id = src.id();

    final String contents;
    try {
        // Apply the optional content transform when one is configured.
        contents = transform == null ? src.content() : transform.apply(src.content());
    } catch (Exception e) {
        LOG.error("Error extracting document text, skipping document: " + id, e);
        counters.errors.incrementAndGet();
        return null;
    }

    if (contents.trim().length() == 0) {
        LOG.info("Empty document: " + id);
        counters.emptyDocuments.incrementAndGet();
        return null;
    }

    final Document document = new Document();

    // Document id, stored so it can be retrieved at search time.
    document.add(new StringField(FIELD_ID, id, Field.Store.YES));

    if (args.storeRawDocs) {
        // Keep the untransformed source alongside the indexed body.
        document.add(new StoredField(FIELD_RAW, src.content()));
    }

    final FieldType fieldType = new FieldType();
    fieldType.setStored(args.storeTransformedDocs);

    // Are we storing document vectors?
    if (args.storeDocvectors) {
        fieldType.setStoreTermVectors(true);
        fieldType.setStoreTermVectorPositions(true);
    }

    // "Positional" index keeps term positions; "count" index keeps only
    // documents and frequencies.
    fieldType.setIndexOptions(args.storePositions ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS
            : IndexOptions.DOCS_AND_FREQS);

    document.add(new Field(FIELD_BODY, contents, fieldType));

    return document;
}

From source file:io.anserini.index.generator.TweetGenerator.java

License:Apache License

@Override
public Document createDocument(TweetCollection.Document tweetDoc) {
    String id = tweetDoc.id();//w  w w  .j a v a2 s.  com

    if (tweetDoc.content().trim().isEmpty()) {
        counters.empty.incrementAndGet();
        return null;
    }
    final TwitterTextParseResults result = TwitterTextParser.parseTweet(tweetDoc.content().trim());
    if (!result.isValid) {
        counters.errors.incrementAndGet();
        return null;
    }
    String text = tweetDoc.content().trim().substring(result.validTextRange.start, result.validTextRange.end);

    if (!args.tweetKeepUrls) {
        final Extractor extractor = new Extractor();
        final List<String> urls = extractor.extractURLs(text);
        for (String url : urls) {
            text = text.replaceAll(url, "");
        }
    }
    text = text.trim();
    if (text.isEmpty()) {
        counters.empty.incrementAndGet();
        return null;
    }

    // Skip deletes tweetids.
    if (deletes != null && deletes.contains(id)) {
        counters.skipped.incrementAndGet();
        return null;
    }

    if (tweetDoc.getIdLong() > args.tweetMaxId) {
        counters.skipped.incrementAndGet();
        return null;
    }

    if (!args.tweetKeepRetweets && tweetDoc.getRetweetedStatusId().isPresent()) {
        counters.skipped.incrementAndGet();
        return null;
    }

    Document doc = new Document();
    doc.add(new StringField(FIELD_ID, id, Field.Store.YES));

    // We need this to break scoring ties.
    doc.add(new LongPoint(StatusField.ID_LONG.name, tweetDoc.getIdLong()));
    doc.add(new NumericDocValuesField(StatusField.ID_LONG.name, tweetDoc.getIdLong()));

    tweetDoc.getEpoch().ifPresent(epoch -> doc.add(new LongPoint(StatusField.EPOCH.name, epoch)));
    doc.add(new StringField(StatusField.SCREEN_NAME.name, tweetDoc.getScreenName(), Field.Store.NO));
    doc.add(new IntPoint(StatusField.FRIENDS_COUNT.name, tweetDoc.getFollowersCount()));
    doc.add(new IntPoint(StatusField.FOLLOWERS_COUNT.name, tweetDoc.getFriendsCount()));
    doc.add(new IntPoint(StatusField.STATUSES_COUNT.name, tweetDoc.getStatusesCount()));

    tweetDoc.getInReplyToStatusId().ifPresent(rid -> {
        doc.add(new LongPoint(StatusField.IN_REPLY_TO_STATUS_ID.name, rid));
        tweetDoc.getInReplyToUserId()
                .ifPresent(ruid -> doc.add(new LongPoint(StatusField.IN_REPLY_TO_USER_ID.name, ruid)));
    });

    tweetDoc.getRetweetedStatusId().ifPresent(rid -> {
        doc.add(new LongPoint(StatusField.RETWEETED_STATUS_ID.name, rid));
        tweetDoc.getRetweetedUserId()
                .ifPresent(ruid -> doc.add(new LongPoint(StatusField.RETWEETED_USER_ID.name, ruid)));
        tweetDoc.getRetweetCount().ifPresent(rc -> doc.add(new LongPoint(StatusField.RETWEET_COUNT.name, rc)));
    });

    tweetDoc.getLang().ifPresent(lang -> doc.add(new StringField(StatusField.LANG.name, lang, Field.Store.NO)));

    if (args.storeRawDocs) { // store the raw json string as one single field
        doc.add(new StoredField(FIELD_RAW, tweetDoc.getJsonString()));
    }

    FieldType fieldType = new FieldType();

    fieldType.setStored(args.storeTransformedDocs);

    // Are we storing document vectors?
    if (args.storeDocvectors) {
        fieldType.setStoreTermVectors(true);
        fieldType.setStoreTermVectorPositions(true);
    }

    // Are we building a "positional" or "count" index?
    if (args.storePositions) {
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    } else {
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    }

    doc.add(new Field(FIELD_BODY, text, fieldType));

    return doc;
}

From source file:io.anserini.integration.IndexerTest.java

License:Apache License

/**
 * Builds a small three-document test index under {@code tempDir1}, using the
 * English analyzer and a freqs-only field type (with stored values and full
 * term vectors), then force-merges it down to a single segment.
 *
 * @throws IOException if the index cannot be written
 */
private void buildTestIndex() throws IOException {
    Directory dir = FSDirectory.open(tempDir1);

    Analyzer analyzer = new EnglishAnalyzer();
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

    IndexWriter writer = new IndexWriter(dir, config);

    // Freqs-only postings, but stored and with positional term vectors so
    // the content can be cross-checked at search time.
    FieldType textOptions = new FieldType();
    textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    textOptions.setStored(true);
    textOptions.setTokenized(true);
    textOptions.setStoreTermVectors(true);
    textOptions.setStoreTermVectorPositions(true);

    // Fixed {docid, text} pairs for the test corpus.
    String[][] corpus = {
            { "doc1", "here is some text here is some more text" },
            { "doc2", "more text" },
            { "doc3", "here is a test" } };
    for (String[] idAndText : corpus) {
        Document doc = new Document();
        doc.add(new StringField("docid", idAndText[0], Field.Store.YES));
        doc.add(new Field("text", idAndText[1], textOptions));
        writer.addDocument(doc);
    }

    writer.commit();
    writer.forceMerge(1);
    writer.close();
}

From source file:it.unipd.dei.ims.lucene.clef.applications.BuildIndex.java

License:Apache License

public static void main(String[] args) {

    // Load configuration either from -Dproperties.path or from the bundled
    // default property file on the classpath.
    Properties properties = new Properties();
    InputStream input = null;
    try {
        if (System.getProperty("properties.path") != null) {
            input = new FileInputStream(System.getProperty("properties.path"));
            properties.load(input);
        } else {
            logger.info("Loading default property file [resources/lucene-clef.properties]");
            ClassLoader loader = Thread.currentThread().getContextClassLoader();
            input = loader.getResourceAsStream("lucene-clef.properties");
            properties.load(input);
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    } finally {
        if (input != null) {
            try {
                input.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    // System properties override anything loaded from the file.
    properties.putAll(System.getProperties());

    String language = properties.getProperty("language");

    String stemmer = properties.getProperty("stemmer");

    String stopsetType = properties.getProperty("stopset.type");

    // A custom stop set additionally needs a path to the stopword file.
    String stopsetPath = null;
    if (stopsetType.equalsIgnoreCase("CUSTOM")) {
        stopsetPath = properties.getProperty("stopset.path");
    }

    String corporaRootPath = properties.getProperty("corpora.path");

    // Expected total document count, used as a sanity check after indexing.
    int corpusSize = Integer.parseInt(properties.getProperty(language + ".corpus.size"));

    String[] corpora = properties.getProperty(language + ".corpora").split(";");

    TrecContentSource trecContentSource = new TrecContentSource();

    try {

        // Configuration handed to the TREC content source / CLEF doc parser.
        Properties configProps = new Properties();
        configProps.setProperty("trec.doc.parser", "it.unipd.dei.ims.lucene.clef.parser.ClefDocParser");
        configProps.setProperty("content.source.verbose", "false");
        configProps.setProperty("content.source.forever", "false");
        configProps.setProperty("content.source.excludeIteration", "true");
        configProps.setProperty("work.dir", new File(".").getAbsolutePath());
        configProps.setProperty("language", language);
        configProps.setProperty("stemmer", stemmer);
        configProps.setProperty("stopset_type", stopsetType);
        configProps.setProperty("stopset_path", stopsetPath);

        // set lucene index directory
        Path indexPath = new File(properties.getProperty("index.path")).toPath();
        Directory directory = new SimpleFSDirectory(indexPath);

        // indexing configuration

        CharArraySet stopset = AnalyzerFactory.createStopset(language, stopsetType, stopsetPath);

        Analyzer analyzer = AnalyzerFactory.createAnalyzer(language, stemmer, stopset);

        IndexWriterConfig conf = new IndexWriterConfig(analyzer);
        conf.setSimilarity(new BM25Similarity());
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

        IndexWriter indexWriter = new IndexWriter(directory, conf);
        // Body field: positional (with offsets) vs. count-only index.
        boolean storePositions = true;
        FieldType bodyFieldType = new FieldType();
        if (storePositions) {
            bodyFieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        } else {
            bodyFieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
        }

        // Index every corpus for the configured language; the content source
        // signals end-of-corpus by throwing NoMoreDataException.
        for (String corpus : corpora) {

            int docCount = 0;

            logger.info("... indexing corpus " + corpus);

            try {

                configProps.setProperty("docs.dir", corporaRootPath + "/" + corpus);

                configProps.setProperty("content.source.encoding",
                        properties.getProperty(corpus + ".encoding", "UTF-8"));

                trecContentSource.setConfig(new Config(configProps));

                DocData docData = new DocData();
                while ((docData = trecContentSource.getNextDocData(docData)) != null) {
                    docCount++;
                    //                    System.out.println("ID: "+docData.getName());
                    //                    System.out.println("BODY: "+docData.getBody());
                    Document doc = getDocumentFromDocData(docData, bodyFieldType);
                    indexWriter.addDocument(doc);
                }

            } catch (NoMoreDataException e) {
                // Expected: marks the end of the current corpus.
                logger.info("... " + docCount + " documents indexed for corpus " + corpus + "\n");
            }

        }

        indexWriter.close();

        // Sanity check: the index must contain exactly the expected number
        // of documents for this language.
        DirectoryReader ireader = DirectoryReader.open(directory);
        if (corpusSize != ireader.numDocs()) {
            throw new Exception("The number of documents indexed is " + ireader.numDocs() + ", but should be "
                    + corpusSize);
        }
        logger.info("Number of documents: " + ireader.numDocs());

    } catch (IOException e) {
        e.printStackTrace();
    } catch (Exception e) {
        e.printStackTrace();
    }

}

From source file:org.aksw.lucene.extractor.DocumentExtractor.java

License:Apache License

/**
 * Appends {@code document} to the index entry matching {@code street} within
 * the configured {@code city}, unless that entry already contains it.
 *
 * @param street street name used to locate the existing index entry
 * @param document document text to add to the entry
 * @throws IOException if the index cannot be read or written
 */
private static void updateDocument(String street, String document) throws IOException {

    IndexReader reader = IndexReader.open(FSDirectory.open(indexDirectory));
    try {
        IndexSearcher searcher = new IndexSearcher(reader);

        // Match on both city and street description.
        BooleanQuery bq = new BooleanQuery();
        bq.add(new TermQuery(new Term(IndexField.CITY, city.toLowerCase())), BooleanClause.Occur.MUST);
        bq.add(new TermQuery(new Term(IndexField.DESCRIPTION, street.toLowerCase())), BooleanClause.Occur.MUST);

        LOG.debug("Filtering using the following parameters...");
        // BUG FIX: the original called "Street:%s".format(street), which
        // invokes the *static* String.format(street) through an instance,
        // silently discarding the "Street:%s" literal.
        LOG.debug(String.format("Street:%s", street));
        LOG.debug(String.format("City:%s", city));

        ScoreDoc[] hits = searcher.search(bq, Integer.MAX_VALUE).scoreDocs;

        if (hits.length != 0) {

            Document doc = searcher.doc(hits[0].doc);

            // Check whether any existing DOCUMENT field already contains it.
            boolean hasDocument = false;
            for (IndexableField f : doc.getFields(IndexField.DOCUMENT)) {
                hasDocument = f.stringValue().contains(document);
                if (hasDocument) {
                    break;
                }
            }

            if (!hasDocument) {
                // Stored, freqs-only field with positional term vectors.
                FieldType fieldType = new FieldType();
                fieldType.setStoreTermVectors(true);
                fieldType.setStoreTermVectorPositions(true);
                fieldType.setIndexed(true);
                fieldType.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS);
                fieldType.setStored(true);
                doc.add(new Field(IndexField.DOCUMENT, document, fieldType));

                writer.updateDocument(new Term(IndexField.DESCRIPTION, street.toLowerCase()), doc);
                writer.commit();

                LOG.debug("commit done!");
            }
        }
    } finally {
        // FIX: the reader was previously never closed (resource leak).
        reader.close();
    }

}

From source file:org.codelibs.elasticsearch.index.mapper.KeywordFieldMapper.java

License:Apache License

// Constructor for keyword fields. The assert enforces that a keyword field
// never indexes more than docs + freqs (compareTo <= 0 on the ordered
// IndexOptions enum): positions/offsets are meaningless for untokenized terms.
protected KeywordFieldMapper(String simpleName, MappedFieldType fieldType, MappedFieldType defaultFieldType,
        int ignoreAbove, Boolean includeInAll, Settings indexSettings, MultiFields multiFields, CopyTo copyTo) {
    super(simpleName, fieldType, defaultFieldType, indexSettings, multiFields, copyTo);
    assert fieldType.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) <= 0;
    // Values longer than ignoreAbove are dropped at parse time.
    this.ignoreAbove = ignoreAbove;
    this.includeInAll = includeInAll;
}