List of usage examples for org.apache.lucene.document FieldType setIndexOptions
public void setIndexOptions(IndexOptions value)
From source file:alix.lucene.Alix.java
License:Open Source License
/** * Parse field type String/*w w w . j av a 2s. c o m*/ * * @param name Name of the field * @param value Value of the field * @param options a string composed of letters in any order following Luke convention to describe fields * IdfpoPSV * I: Indexed * d: docs * f: freqs * p: pos * o: offset * P: payloads * S: Stored * V: TermVector */ public static FieldType fieldType(String options) { FieldType type; if (options == null) return new FieldType(); if ("S".equals(options)) { type = new FieldType(); type.setStored(true); return type; } if (options.contains("S")) { type = new FieldType(TextField.TYPE_STORED); } else { type = new FieldType(TextField.TYPE_NOT_STORED); } // optimize ? type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); if (options.contains("p")) { type.setStoreTermVectorPositions(true); } if (options.contains("o")) { type.setTokenized(true); type.setStoreTermVectors(true); type.setStoreTermVectorOffsets(true); } if (options.contains("P")) { type.setTokenized(true); type.setStoreTermVectors(true); type.setStoreTermVectorPositions(true); type.setStoreTermVectorPayloads(true); } if (options.contains("V")) { type.setTokenized(true); type.setStoreTermVectors(true); } return type; }
From source file:api.startup.PDFIndexer.java
License:Open Source License
/** * Indexes a single document and writes it to the given index writer * @param writer - the index writer to writer * @param metadata - the document/*from www. jav a2 s . c om*/ * @throws IOException */ static void indexDoc(IndexWriter writer, DocumentMetadata metadata) throws IOException { Path file = Paths.get(metadata.getFilename()); try { Document doc = new Document(); Field pathField = new StringField(Constants.FIELD_PATH, file.toString(), Field.Store.YES); doc.add(pathField); // Add Document metadata // doc.add(new StringField(Constants.FIELD_AUTHOR, metadata.getAuthor(), Field.Store.YES)); doc.add(new StringField(Constants.FIELD_TITLE, metadata.getTitle(), Field.Store.YES)); doc.add(new StringField(Constants.FIELD_CONFERENCE, metadata.getConference(), Field.Store.YES)); // End of Document Metadata // Field modified = new LongField(Constants.FIELD_MODIFIED, Files.getLastModifiedTime(file).toMillis(), Field.Store.YES); doc.add(modified); PDFTextExtractor extractor = new PDFTextExtractor(); // Get the string contents String textContents = extractor.extractText(file.toString()); // Store the string contents FieldType contentsType = new FieldType(); contentsType.setStored(true); contentsType.setTokenized(true); contentsType.setStoreTermVectors(true); contentsType.setStoreTermVectorPositions(true); contentsType.setStoreTermVectorPayloads(true); contentsType.setStoreTermVectorOffsets(true); contentsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); Field contents = new Field(Constants.FIELD_CONTENTS, textContents, contentsType); doc.add(contents); if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): log.info("adding " + file + " to index"); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: log.info("updating " + file + " in index"); writer.updateDocument(new Term(Constants.FIELD_PATH, file.toString()), doc); } } catch (IOException e) { log.error("Failed to read file " + metadata.getFilename()); } }
From source file:cc.twittertools.index.IndexStatuses.java
License:Apache License
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(new Option(HELP_OPTION, "show help")); options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment")); options.addOption(new Option(STORE_TERM_VECTORS_OPTION, "store term vectors")); options.addOption(OptionBuilder.withArgName("dir").hasArg().withDescription("source collection directory") .create(COLLECTION_OPTION)); options.addOption(//from w ww . java2 s . c o m OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("file").hasArg().withDescription("file with deleted tweetids") .create(DELETES_OPTION)); options.addOption(OptionBuilder.withArgName("id").hasArg().withDescription("max id").create(MAX_ID_OPTION)); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(IndexStatuses.class.getName(), options); System.exit(-1); } String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION); String indexPath = cmdline.getOptionValue(INDEX_OPTION); final FieldType textOptions = new FieldType(); textOptions.setIndexed(true); textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); textOptions.setStored(true); textOptions.setTokenized(true); if (cmdline.hasOption(STORE_TERM_VECTORS_OPTION)) { textOptions.setStoreTermVectors(true); } LOG.info("collection: " + collectionPath); LOG.info("index: " + indexPath); LongOpenHashSet deletes = null; if (cmdline.hasOption(DELETES_OPTION)) { deletes = new LongOpenHashSet(); File deletesFile = new File(cmdline.getOptionValue(DELETES_OPTION)); if (!deletesFile.exists()) { System.err.println("Error: " + deletesFile + " does not exist!"); System.exit(-1); } LOG.info("Reading deletes from " + deletesFile); FileInputStream fin = new FileInputStream(deletesFile); byte[] ignoreBytes = new byte[2]; fin.read(ignoreBytes); // "B", "Z" bytes from commandline tools BufferedReader br = new BufferedReader(new InputStreamReader(new CBZip2InputStream(fin))); String s; while ((s = br.readLine()) != null) { if (s.contains("\t")) { deletes.add(Long.parseLong(s.split("\t")[0])); } else { deletes.add(Long.parseLong(s)); } } br.close(); fin.close(); LOG.info("Read " + deletes.size() + " tweetids from deletes file."); } long maxId = Long.MAX_VALUE; if (cmdline.hasOption(MAX_ID_OPTION)) { maxId = Long.parseLong(cmdline.getOptionValue(MAX_ID_OPTION)); LOG.info("index: " + maxId); } long startTime = System.currentTimeMillis(); File file = new File(collectionPath); if (!file.exists()) { System.err.println("Error: " + file + " does not exist!"); System.exit(-1); } StatusStream stream = new JsonStatusCorpusReader(file); Directory dir = FSDirectory.open(new File(indexPath)); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, IndexStatuses.ANALYZER); config.setOpenMode(OpenMode.CREATE); IndexWriter writer = new IndexWriter(dir, config); int cnt = 0; Status status; try { while ((status = stream.next()) != null) { if (status.getText() == null) { continue; } // Skip deletes tweetids. if (deletes != null && deletes.contains(status.getId())) { continue; } if (status.getId() > maxId) { continue; } cnt++; Document doc = new Document(); doc.add(new LongField(StatusField.ID.name, status.getId(), Field.Store.YES)); doc.add(new LongField(StatusField.EPOCH.name, status.getEpoch(), Field.Store.YES)); doc.add(new TextField(StatusField.SCREEN_NAME.name, status.getScreenname(), Store.YES)); doc.add(new Field(StatusField.TEXT.name, status.getText(), textOptions)); doc.add(new IntField(StatusField.FRIENDS_COUNT.name, status.getFollowersCount(), Store.YES)); doc.add(new IntField(StatusField.FOLLOWERS_COUNT.name, status.getFriendsCount(), Store.YES)); doc.add(new IntField(StatusField.STATUSES_COUNT.name, status.getStatusesCount(), Store.YES)); long inReplyToStatusId = status.getInReplyToStatusId(); if (inReplyToStatusId > 0) { doc.add(new LongField(StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId, Field.Store.YES)); doc.add(new LongField(StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId(), Field.Store.YES)); } String lang = status.getLang(); if (!lang.equals("unknown")) { doc.add(new TextField(StatusField.LANG.name, status.getLang(), Store.YES)); } long retweetStatusId = status.getRetweetedStatusId(); if (retweetStatusId > 0) { doc.add(new LongField(StatusField.RETWEETED_STATUS_ID.name, retweetStatusId, Field.Store.YES)); doc.add(new LongField(StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId(), Field.Store.YES)); doc.add(new IntField(StatusField.RETWEET_COUNT.name, status.getRetweetCount(), Store.YES)); if (status.getRetweetCount() < 0 || status.getRetweetedStatusId() < 0) { LOG.warn("Error parsing retweet fields of " + status.getId()); } } writer.addDocument(doc); if (cnt % 100000 == 0) { LOG.info(cnt + " statuses indexed"); } } LOG.info(String.format("Total of %s statuses added", cnt)); if (cmdline.hasOption(OPTIMIZE_OPTION)) { LOG.info("Merging segments..."); writer.forceMerge(1); LOG.info("Done!"); } LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms"); } catch (Exception e) { e.printStackTrace(); } finally { writer.close(); dir.close(); stream.close(); } }
From source file:com.basistech.lucene.tools.LuceneQueryToolTest.java
License:Apache License
@BeforeClass public static void oneTimeSetup() throws IOException, ParseException { LuceneQueryToolTest.showOutput = false; // for debugging tests Directory dir = new RAMDirectory(); Analyzer analyzer = new StandardAnalyzer(); IndexWriterConfig config = new IndexWriterConfig(analyzer); IndexWriter writer = new IndexWriter(dir, config); Document doc = new Document(); doc.add(new Field("longest-mention", "Bill Clinton", StringField.TYPE_STORED)); doc.add(new Field("context", "Hillary Clinton Arkansas", TextField.TYPE_NOT_STORED)); writer.addDocument(doc);/*from w w w . ja v a 2s .c o m*/ doc = new Document(); doc.add(new Field("longest-mention", "George W. Bush", StringField.TYPE_STORED)); doc.add(new Field("context", "Texas Laura Bush", TextField.TYPE_NOT_STORED)); writer.addDocument(doc); doc = new Document(); doc.add(new Field("longest-mention", "George H. W. Bush", StringField.TYPE_STORED)); doc.add(new Field("context", "Barbara Bush Texas", TextField.TYPE_NOT_STORED)); writer.addDocument(doc); doc = new Document(); doc.add(new Field("bbb", "foo", StringField.TYPE_STORED)); doc.add(new Field("bbb", "bar", StringField.TYPE_STORED)); doc.add(new Field("aaa", "foo", StringField.TYPE_STORED)); FieldType typeUnindexed = new FieldType(StringField.TYPE_STORED); typeUnindexed.setIndexOptions(IndexOptions.NONE); doc.add(new Field("zzz", "foo", typeUnindexed)); writer.addDocument(doc); writer.close(); reader = DirectoryReader.open(dir); }
From source file:com.epam.catgenome.dao.index.FeatureIndexDao.java
License:Open Source License
private void addCommonDocumentFields(Document document, FeatureIndexEntry entry, final Long featureFileId) { document.add(new SortedStringField(FeatureIndexFields.FEATURE_ID.getFieldName(), entry.getFeatureId())); FieldType fieldType = new FieldType(); fieldType.setOmitNorms(true);// w w w .java 2s. c o m fieldType.setIndexOptions(IndexOptions.DOCS); fieldType.setStored(true); fieldType.setTokenized(false); fieldType.setDocValuesType(DocValuesType.SORTED); fieldType.freeze(); Field field = new Field(FeatureIndexFields.CHROMOSOME_ID.getFieldName(), entry.getChromosome() != null ? new BytesRef(entry.getChromosome().getId().toString()) : new BytesRef(""), fieldType); document.add(field); document.add(new SortedStringField(FeatureIndexFields.CHROMOSOME_NAME.getFieldName(), entry.getChromosome().getName(), true)); document.add(new SortedIntPoint(FeatureIndexFields.START_INDEX.getFieldName(), entry.getStartIndex())); document.add(new StoredField(FeatureIndexFields.START_INDEX.getFieldName(), entry.getStartIndex())); document.add(new SortedDocValuesField(FeatureIndexFields.START_INDEX.getGroupName(), new BytesRef(entry.getStartIndex().toString()))); document.add(new SortedIntPoint(FeatureIndexFields.END_INDEX.getFieldName(), entry.getEndIndex())); document.add(new StoredField(FeatureIndexFields.END_INDEX.getFieldName(), entry.getEndIndex())); document.add(new SortedDocValuesField(FeatureIndexFields.END_INDEX.getGroupName(), new BytesRef(entry.getStartIndex().toString()))); document.add(new StringField(FeatureIndexFields.FEATURE_TYPE.getFieldName(), entry.getFeatureType() != null ? entry.getFeatureType().getFileValue() : "", Field.Store.YES)); document.add(new StringField(FeatureIndexFields.FILE_ID.getFieldName(), featureFileId.toString(), Field.Store.YES)); document.add(new StringField(FeatureIndexFields.FEATURE_NAME.getFieldName(), entry.getFeatureName() != null ? entry.getFeatureName().toLowerCase() : "", Field.Store.YES)); document.add(new SortedDocValuesField(FeatureIndexFields.FEATURE_NAME.getFieldName(), new BytesRef(entry.getFeatureName() != null ? entry.getFeatureName() : ""))); document.add(new SortedSetDocValuesFacetField(FeatureIndexFields.CHR_ID.getFieldName(), entry.getChromosome().getId().toString())); document.add(new SortedStringField(FeatureIndexFields.UID.getFieldName(), entry.getUuid().toString())); document.add(new SortedSetDocValuesFacetField(FeatureIndexFields.F_UID.getFieldName(), entry.getUuid().toString())); }
From source file:com.github.hotware.lucene.extension.bean.field.BeanInformationCacheImpl.java
License:BEER-WARE LICENSE
private FieldInformation buildFieldInformation(BeanField bf, Field field, Class<?> fieldClass) { com.github.hotware.lucene.extension.bean.type.Type typeWrapper; try {/*from www . j a v a 2 s . c om*/ // TODO: maybe cache these? typeWrapper = (com.github.hotware.lucene.extension.bean.type.Type) bf.type().newInstance(); } catch (InstantiationException | IllegalAccessException e) { throw new RuntimeException(e); } FieldType fieldType = new FieldType(); fieldType.setIndexed(bf.index()); fieldType.setStored(bf.store()); fieldType.setTokenized(bf.tokenized()); fieldType.setStoreTermVectors(bf.storeTermVectors()); fieldType.setStoreTermVectorPositions(bf.storeTermVectorPositions()); fieldType.setStoreTermVectorOffsets(bf.storeTermVectorOffsets()); fieldType.setStoreTermVectorPayloads(bf.storeTermVectorPayloads()); fieldType.setOmitNorms(bf.omitNorms()); fieldType.setIndexOptions(bf.indexOptions()); typeWrapper.configureFieldType(fieldType); fieldType.freeze(); return new FieldInformation(new FrozenField(field), fieldClass, fieldType, bf); }
From source file:com.o19s.es.ltr.query.LtrQueryTests.java
License:Apache License
private Field newField(String name, String value, Store stored) { FieldType tagsFieldType = new FieldType(); tagsFieldType.setStored(stored == Store.YES); IndexOptions idxOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS; tagsFieldType.setIndexOptions(idxOptions); return new Field(name, value, tagsFieldType); }
From source file:com.orientechnologies.spatial.engine.OLuceneSpatialIndexEngineAbstract.java
License:Apache License
protected Document newGeoDocument(OIdentifiable oIdentifiable, Shape shape) { FieldType ft = new FieldType(); ft.setIndexOptions(IndexOptions.DOCS); ft.setStored(true);/*from w w w . j a v a2 s .c o m*/ Document doc = new Document(); doc.add(OLuceneIndexType.createField(RID, oIdentifiable.getIdentity().toString(), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS)); for (IndexableField f : strategy.createIndexableFields(shape)) { doc.add(f); } doc.add(new StoredField(strategy.getFieldName(), ctx.toString(shape))); return doc; }
From source file:com.ponysdk.sample.client.page.addon.SelectizeAddon.java
License:Apache License
public SelectizeAddon() { super(Element.newInput()); setTerminalHandler(this); ///*from w ww . ja v a 2 s .c o m*/ final Analyzer analyzer = new StandardAnalyzer(); final Directory directory = new RAMDirectory(); final IndexWriterConfig config = new IndexWriterConfig(analyzer); IndexWriter writer; try { writer = new IndexWriter(directory, config); final Document doc = new Document(); final String text = "Test de ouf"; final FieldType fieldType = new FieldType(); fieldType.setIndexOptions(IndexOptions.NONE); fieldType.setStored(true); fieldType.setTokenized(false); doc.add(new Field("id", "12", fieldType)); doc.add(new Field("fieldname", text, TextField.TYPE_STORED)); writer.addDocument(doc); addAssetsType(writer); addTenor(writer); addClients(writer); addSide(writer); writer.close(); } catch (final IOException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } try { // Now search the index: final DirectoryReader ireader = DirectoryReader.open(directory); isearcher = new IndexSearcher(ireader); // Parse a simple query that searches for "text": // final QueryParser parser = new QueryParser("fieldname", // analyzer); // parser.setFuzzyMinSim(2f); final Term term = new Term("fieldname", "indesfed"); final Query query = new FuzzyQuery(term); // final TopDocs hits = isearcher.search(query, 1000).scoreDocs; // final Query query = parser.parse("indeed"); final ScoreDoc[] hits = isearcher.search(query, 1000).scoreDocs; // Iterate through the results: for (final ScoreDoc hit : hits) { System.err.println("Score : " + hit.score); final Document hitDoc = isearcher.doc(hit.doc); System.err.println("Found document" + hitDoc.getField("fieldname").stringValue()); } // ireader.close(); // directory.close(); } catch (final Exception exception) { exception.printStackTrace(); } // <input type="text" id="input-tags3" class="demo-default" // value="science,biology,chemistry,physics"> }
From source file:com.ponysdk.sample.client.page.addon.SelectizeAddon.java
License:Apache License
private void addSide(final IndexWriter writer) throws IOException { final Document doc1 = new Document(); final FieldType fieldType1 = new FieldType(); fieldType1.setIndexOptions(IndexOptions.NONE); fieldType1.setStored(true);/*from www .ja v a 2 s . com*/ fieldType1.setTokenized(false); doc1.add(new Field("id", "sell", fieldType1)); doc1.add(new Field("fieldname", "Sell", TextField.TYPE_STORED)); doc1.add(new Field("fieldname", "S", TextField.TYPE_STORED)); doc1.add(new Field("type", Type.SIDE.name(), TextField.TYPE_STORED)); doc1.add(new Field("desc", "side", TextField.TYPE_STORED)); writer.addDocument(doc1); final Document doc2 = new Document(); final FieldType fieldType2 = new FieldType(); fieldType2.setIndexOptions(IndexOptions.NONE); fieldType2.setStored(true); fieldType2.setTokenized(false); doc2.add(new Field("id", "buy", fieldType2)); doc2.add(new Field("fieldname", "Buy", TextField.TYPE_STORED)); doc2.add(new Field("fieldname", "B", TextField.TYPE_STORED)); doc2.add(new Field("type", Type.SIDE.name(), TextField.TYPE_STORED)); doc2.add(new Field("desc", "side", TextField.TYPE_STORED)); writer.addDocument(doc2); }