List of usage examples for org.apache.lucene.index IndexOptions DOCS_AND_FREQS
IndexOptions DOCS_AND_FREQS
To view the source code for org.apache.lucene.index IndexOptions DOCS_AND_FREQS, click the Source Link.
From source file:com.lucure.core.codec.LucurePostingsWriter.java
License:Apache License
@Override public int setField(FieldInfo fieldInfo) { IndexOptions indexOptions = fieldInfo.getIndexOptions(); fieldHasFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; fieldHasPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; fieldHasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; fieldHasPayloads = fieldInfo.hasPayloads(); skipWriter.setField(fieldHasPositions, fieldHasOffsets, fieldHasPayloads); lastState = emptyState;//from w ww . j av a2s .c om if (fieldHasPositions) { if (fieldHasPayloads || fieldHasOffsets) { return 3; // doc + pos + pay FP } else { return 2; // doc + pos FP } } else { return 1; // doc FP } }
From source file:com.rocana.lucene.codec.v1.RocanaFieldReader.java
License:Apache License
@Override
public boolean hasFreqs() {
    // Frequencies are present whenever the field is indexed at DOCS_AND_FREQS
    // or any richer option (positions, offsets).
    final IndexOptions options = fieldInfo.getIndexOptions();
    return options.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
}
From source file:com.rocana.lucene.codec.v1.TestBlockPostingsFormat3.java
License:Apache License
/**
 * Indexes the same random text into eight fields covering every combination of
 * index options (docs-only, freqs, positions, offsets) and payload styles, then
 * cross-checks the postings (via {@code verify}) and runs checkIndex, both
 * before and after a forceMerge.
 */
public void test() throws Exception {
    Directory dir = newDirectory();
    // Per-field analyzer: fields whose names mention payloads get a payload filter
    // (fixed-length or variable-length), everything else is plain tokenization.
    Analyzer analyzer = new Analyzer(Analyzer.PER_FIELD_REUSE_STRATEGY) {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer();
            if (fieldName.contains("payloadsFixed")) {
                TokenFilter filter = new MockFixedLengthPayloadFilter(new Random(0), tokenizer, 1);
                return new TokenStreamComponents(tokenizer, filter);
            } else if (fieldName.contains("payloadsVariable")) {
                TokenFilter filter = new MockVariableLengthPayloadFilter(new Random(0), tokenizer);
                return new TokenStreamComponents(tokenizer, filter);
            } else {
                return new TokenStreamComponents(tokenizer);
            }
        }
    };
    IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
    iwc.setCodec(TestUtil.alwaysPostingsFormat(new RocanaLucene50PostingsFormat()));
    // TODO we could actually add more fields implemented with different PFs
    // or, just put this test into the usual rotation?
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
    Document doc = new Document();

    FieldType docsOnlyType = new FieldType(TextField.TYPE_NOT_STORED);
    // turn this on for a cross-check
    docsOnlyType.setStoreTermVectors(true);
    docsOnlyType.setIndexOptions(IndexOptions.DOCS);

    FieldType docsAndFreqsType = new FieldType(TextField.TYPE_NOT_STORED);
    // turn this on for a cross-check
    docsAndFreqsType.setStoreTermVectors(true);
    docsAndFreqsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);

    FieldType positionsType = new FieldType(TextField.TYPE_NOT_STORED);
    // turn these on for a cross-check
    positionsType.setStoreTermVectors(true);
    positionsType.setStoreTermVectorPositions(true);
    positionsType.setStoreTermVectorOffsets(true);
    positionsType.setStoreTermVectorPayloads(true);

    FieldType offsetsType = new FieldType(positionsType);
    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);

    // One field per index-option/payload combination; all share the same content.
    Field field1 = new Field("field1docs", "", docsOnlyType);
    Field field2 = new Field("field2freqs", "", docsAndFreqsType);
    Field field3 = new Field("field3positions", "", positionsType);
    Field field4 = new Field("field4offsets", "", offsetsType);
    Field field5 = new Field("field5payloadsFixed", "", positionsType);
    Field field6 = new Field("field6payloadsVariable", "", positionsType);
    Field field7 = new Field("field7payloadsFixedOffsets", "", offsetsType);
    Field field8 = new Field("field8payloadsVariableOffsets", "", offsetsType);
    doc.add(field1);
    doc.add(field2);
    doc.add(field3);
    doc.add(field4);
    doc.add(field5);
    doc.add(field6);
    doc.add(field7);
    doc.add(field8);

    for (int i = 0; i < MAXDOC; i++) {
        // Mix of a unique token, a very common token, English words, and a random string.
        String stringValue = Integer.toString(i) + " verycommon " + English.intToEnglish(i).replace('-', ' ')
                + " " + TestUtil.randomSimpleString(random());
        field1.setStringValue(stringValue);
        field2.setStringValue(stringValue);
        field3.setStringValue(stringValue);
        field4.setStringValue(stringValue);
        field5.setStringValue(stringValue);
        field6.setStringValue(stringValue);
        field7.setStringValue(stringValue);
        field8.setStringValue(stringValue);
        iw.addDocument(doc);
    }
    iw.close();
    verify(dir);
    TestUtil.checkIndex(dir); // for some extra coverage, checkIndex before we forceMerge

    // Reopen in APPEND mode, force-merge to a single segment, and verify again.
    iwc = newIndexWriterConfig(analyzer);
    iwc.setCodec(TestUtil.alwaysPostingsFormat(new RocanaLucene50PostingsFormat()));
    iwc.setOpenMode(OpenMode.APPEND);
    IndexWriter iw2 = new IndexWriter(dir, iwc);
    iw2.forceMerge(1);
    iw2.close();
    verify(dir);
    dir.close();
}
From source file:com.vmware.xenon.services.common.Lucene60FieldInfosFormatWithCache.java
License:Open Source License
private static IndexOptions getIndexOptions(IndexInput input, byte b) throws IOException { switch (b) {/*from w ww . j a v a 2 s. co m*/ case 0: return IndexOptions.NONE; case 1: return IndexOptions.DOCS; case 2: return IndexOptions.DOCS_AND_FREQS; case 3: return IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; case 4: return IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS; default: // BUG throw new CorruptIndexException("invalid IndexOptions byte: " + b, input); } }
From source file:io.anserini.index.generator.LuceneDocumentGenerator.java
License:Apache License
/**
 * Converts a {@link SourceDocument} into a Lucene {@link Document}, or returns
 * {@code null} if the document should be skipped (transform failed or content
 * is empty). Skips are counted in {@code counters}.
 */
public Document createDocument(SourceDocument src) {
    String id = src.id();
    String contents;
    try {
        // If there's a transform, use it.
        contents = transform != null ? transform.apply(src.content()) : src.content();
    } catch (Exception e) {
        LOG.error("Error extracting document text, skipping document: " + id, e);
        counters.errors.incrementAndGet();
        return null;
    }
    if (contents.trim().length() == 0) {
        LOG.info("Empty document: " + id);
        counters.emptyDocuments.incrementAndGet();
        return null;
    }

    // make a new, empty document
    Document document = new Document();

    // document id
    document.add(new StringField(FIELD_ID, id, Field.Store.YES));
    if (args.storeRawDocs) {
        // Keep the original (untransformed) content alongside the indexed body.
        document.add(new StoredField(FIELD_RAW, src.content()));
    }

    FieldType fieldType = new FieldType();
    fieldType.setStored(args.storeTransformedDocs);
    // Are we storing document vectors?
    if (args.storeDocvectors) {
        fieldType.setStoreTermVectors(true);
        fieldType.setStoreTermVectorPositions(true);
    }
    // Are we building a "positional" or "count" index?
    if (args.storePositions) {
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    } else {
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    }
    document.add(new Field(FIELD_BODY, contents, fieldType));
    return document;
}
From source file:io.anserini.index.generator.TweetGenerator.java
License:Apache License
@Override public Document createDocument(TweetCollection.Document tweetDoc) { String id = tweetDoc.id();//w w w .j a v a2 s. com if (tweetDoc.content().trim().isEmpty()) { counters.empty.incrementAndGet(); return null; } final TwitterTextParseResults result = TwitterTextParser.parseTweet(tweetDoc.content().trim()); if (!result.isValid) { counters.errors.incrementAndGet(); return null; } String text = tweetDoc.content().trim().substring(result.validTextRange.start, result.validTextRange.end); if (!args.tweetKeepUrls) { final Extractor extractor = new Extractor(); final List<String> urls = extractor.extractURLs(text); for (String url : urls) { text = text.replaceAll(url, ""); } } text = text.trim(); if (text.isEmpty()) { counters.empty.incrementAndGet(); return null; } // Skip deletes tweetids. if (deletes != null && deletes.contains(id)) { counters.skipped.incrementAndGet(); return null; } if (tweetDoc.getIdLong() > args.tweetMaxId) { counters.skipped.incrementAndGet(); return null; } if (!args.tweetKeepRetweets && tweetDoc.getRetweetedStatusId().isPresent()) { counters.skipped.incrementAndGet(); return null; } Document doc = new Document(); doc.add(new StringField(FIELD_ID, id, Field.Store.YES)); // We need this to break scoring ties. 
doc.add(new LongPoint(StatusField.ID_LONG.name, tweetDoc.getIdLong())); doc.add(new NumericDocValuesField(StatusField.ID_LONG.name, tweetDoc.getIdLong())); tweetDoc.getEpoch().ifPresent(epoch -> doc.add(new LongPoint(StatusField.EPOCH.name, epoch))); doc.add(new StringField(StatusField.SCREEN_NAME.name, tweetDoc.getScreenName(), Field.Store.NO)); doc.add(new IntPoint(StatusField.FRIENDS_COUNT.name, tweetDoc.getFollowersCount())); doc.add(new IntPoint(StatusField.FOLLOWERS_COUNT.name, tweetDoc.getFriendsCount())); doc.add(new IntPoint(StatusField.STATUSES_COUNT.name, tweetDoc.getStatusesCount())); tweetDoc.getInReplyToStatusId().ifPresent(rid -> { doc.add(new LongPoint(StatusField.IN_REPLY_TO_STATUS_ID.name, rid)); tweetDoc.getInReplyToUserId() .ifPresent(ruid -> doc.add(new LongPoint(StatusField.IN_REPLY_TO_USER_ID.name, ruid))); }); tweetDoc.getRetweetedStatusId().ifPresent(rid -> { doc.add(new LongPoint(StatusField.RETWEETED_STATUS_ID.name, rid)); tweetDoc.getRetweetedUserId() .ifPresent(ruid -> doc.add(new LongPoint(StatusField.RETWEETED_USER_ID.name, ruid))); tweetDoc.getRetweetCount().ifPresent(rc -> doc.add(new LongPoint(StatusField.RETWEET_COUNT.name, rc))); }); tweetDoc.getLang().ifPresent(lang -> doc.add(new StringField(StatusField.LANG.name, lang, Field.Store.NO))); if (args.storeRawDocs) { // store the raw json string as one single field doc.add(new StoredField(FIELD_RAW, tweetDoc.getJsonString())); } FieldType fieldType = new FieldType(); fieldType.setStored(args.storeTransformedDocs); // Are we storing document vectors? if (args.storeDocvectors) { fieldType.setStoreTermVectors(true); fieldType.setStoreTermVectorPositions(true); } // Are we building a "positional" or "count" index? if (args.storePositions) { fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); } else { fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS); } doc.add(new Field(FIELD_BODY, text, fieldType)); return doc; }
From source file:io.anserini.integration.IndexerTest.java
License:Apache License
/**
 * Builds a small three-document index under {@code tempDir1} with a stored,
 * tokenized text field (DOCS_AND_FREQS, with term vectors + positions),
 * then commits and force-merges to a single segment.
 */
private void buildTestIndex() throws IOException {
    Directory dir = FSDirectory.open(tempDir1);
    Analyzer analyzer = new EnglishAnalyzer();
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IndexWriter writer = new IndexWriter(dir, config);

    FieldType textOptions = new FieldType();
    textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    textOptions.setStored(true);
    textOptions.setTokenized(true);
    textOptions.setStoreTermVectors(true);
    textOptions.setStoreTermVectorPositions(true);

    // Fixture documents: {docid, text}. Added in order so doc ids are stable.
    String[][] fixtures = {
            { "doc1", "here is some text here is some more text" },
            { "doc2", "more text" },
            { "doc3", "here is a test" } };
    for (String[] fixture : fixtures) {
        Document doc = new Document();
        doc.add(new StringField("docid", fixture[0], Field.Store.YES));
        doc.add(new Field("text", fixture[1], textOptions));
        writer.addDocument(doc);
    }

    writer.commit();
    writer.forceMerge(1);
    writer.close();
}
From source file:it.unipd.dei.ims.lucene.clef.applications.BuildIndex.java
License:Apache License
public static void main(String[] args) { Properties properties = new Properties(); InputStream input = null;//from ww w. java 2 s .co m try { if (System.getProperty("properties.path") != null) { input = new FileInputStream(System.getProperty("properties.path")); properties.load(input); } else { logger.info("Loading default property file [resources/lucene-clef.properties]"); ClassLoader loader = Thread.currentThread().getContextClassLoader(); input = loader.getResourceAsStream("lucene-clef.properties"); properties.load(input); } } catch (IOException ex) { ex.printStackTrace(); } finally { if (input != null) { try { input.close(); } catch (IOException e) { e.printStackTrace(); } } } properties.putAll(System.getProperties()); String language = properties.getProperty("language"); String stemmer = properties.getProperty("stemmer"); String stopsetType = properties.getProperty("stopset.type"); String stopsetPath = null; if (stopsetType.equalsIgnoreCase("CUSTOM")) { stopsetPath = properties.getProperty("stopset.path"); } String corporaRootPath = properties.getProperty("corpora.path"); int corpusSize = Integer.parseInt(properties.getProperty(language + ".corpus.size")); String[] corpora = properties.getProperty(language + ".corpora").split(";"); TrecContentSource trecContentSource = new TrecContentSource(); try { Properties configProps = new Properties(); configProps.setProperty("trec.doc.parser", "it.unipd.dei.ims.lucene.clef.parser.ClefDocParser"); configProps.setProperty("content.source.verbose", "false"); configProps.setProperty("content.source.forever", "false"); configProps.setProperty("content.source.excludeIteration", "true"); configProps.setProperty("work.dir", new File(".").getAbsolutePath()); configProps.setProperty("language", language); configProps.setProperty("stemmer", stemmer); configProps.setProperty("stopset_type", stopsetType); configProps.setProperty("stopset_path", stopsetPath); // set lucene index directory Path indexPath = new 
File(properties.getProperty("index.path")).toPath(); Directory directory = new SimpleFSDirectory(indexPath); // indexing configuration CharArraySet stopset = AnalyzerFactory.createStopset(language, stopsetType, stopsetPath); Analyzer analyzer = AnalyzerFactory.createAnalyzer(language, stemmer, stopset); IndexWriterConfig conf = new IndexWriterConfig(analyzer); conf.setSimilarity(new BM25Similarity()); conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE); IndexWriter indexWriter = new IndexWriter(directory, conf); boolean storePositions = true; FieldType bodyFieldType = new FieldType(); if (storePositions) { bodyFieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); } else { bodyFieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS); } for (String corpus : corpora) { int docCount = 0; logger.info("... indexing corpus " + corpus); try { configProps.setProperty("docs.dir", corporaRootPath + "/" + corpus); configProps.setProperty("content.source.encoding", properties.getProperty(corpus + ".encoding", "UTF-8")); trecContentSource.setConfig(new Config(configProps)); DocData docData = new DocData(); while ((docData = trecContentSource.getNextDocData(docData)) != null) { docCount++; // System.out.println("ID: "+docData.getName()); // System.out.println("BODY: "+docData.getBody()); Document doc = getDocumentFromDocData(docData, bodyFieldType); indexWriter.addDocument(doc); } } catch (NoMoreDataException e) { logger.info("... " + docCount + " documents indexed for corpus " + corpus + "\n"); } } indexWriter.close(); DirectoryReader ireader = DirectoryReader.open(directory); if (corpusSize != ireader.numDocs()) { throw new Exception("The number of documents indexed is " + ireader.numDocs() + ", but should be " + corpusSize); } logger.info("Number of documents: " + ireader.numDocs()); } catch (IOException e) { e.printStackTrace(); } catch (Exception e) { e.printStackTrace(); } }
From source file:org.aksw.lucene.extractor.DocumentExtractor.java
License:Apache License
/**
 * Appends {@code document} to the indexed entry matching {@code street} in the
 * current city, unless that entry's DOCUMENT fields already contain it.
 *
 * @param street   street name used to look up the existing index entry
 * @param document document text to attach to the matched entry
 * @throws IOException on index read/write failure
 */
private static void updateDocument(String street, String document) throws IOException {
    IndexReader reader = IndexReader.open(FSDirectory.open(indexDirectory));
    try {
        IndexSearcher searcher = new IndexSearcher(reader);

        BooleanQuery bq = new BooleanQuery();
        bq.add(new TermQuery(new Term(IndexField.CITY, city.toLowerCase())), BooleanClause.Occur.MUST);
        bq.add(new TermQuery(new Term(IndexField.DESCRIPTION, street.toLowerCase())), BooleanClause.Occur.MUST);

        LOG.debug("Filtering using the following parameters...");
        // BUG FIX: '"Street:%s".format(street)' invokes the *static* String.format
        // through an instance reference, i.e. String.format(street) — the "Street:%s"
        // template was silently discarded and 'street' was used as the format string.
        LOG.debug(String.format("Street:%s", street));
        LOG.debug(String.format("City:%s", city));

        ScoreDoc[] hits = searcher.search(bq, Integer.MAX_VALUE).scoreDocs;
        if (hits.length != 0) {
            Document doc = searcher.doc(hits[0].doc);
            // Only append if no existing DOCUMENT field already contains this text.
            boolean hasDocument = false;
            for (IndexableField f : doc.getFields(IndexField.DOCUMENT)) {
                hasDocument = f.stringValue().contains(document);
                if (hasDocument) {
                    break;
                }
            }
            if (!hasDocument) {
                FieldType fieldType = new FieldType();
                fieldType.setStoreTermVectors(true);
                fieldType.setStoreTermVectorPositions(true);
                fieldType.setIndexed(true);
                fieldType.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS);
                fieldType.setStored(true);
                doc.add(new Field(IndexField.DOCUMENT, document, fieldType));
                writer.updateDocument(new Term(IndexField.DESCRIPTION, street.toLowerCase()), doc);
                writer.commit();
                LOG.debug("commit done!");
            }
        }
    } finally {
        // BUG FIX: the reader was never closed, leaking file handles on every call.
        reader.close();
    }
}
From source file:org.codelibs.elasticsearch.index.mapper.KeywordFieldMapper.java
License:Apache License
protected KeywordFieldMapper(String simpleName, MappedFieldType fieldType, MappedFieldType defaultFieldType,
        int ignoreAbove, Boolean includeInAll, Settings indexSettings, MultiFields multiFields, CopyTo copyTo) {
    super(simpleName, fieldType, defaultFieldType, indexSettings, multiFields, copyTo);
    // Keyword fields index at most DOCS_AND_FREQS (no positions/offsets);
    // anything richer here indicates a programming error upstream.
    assert fieldType.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) <= 0;
    this.ignoreAbove = ignoreAbove;
    this.includeInAll = includeInAll;
}