List of usage examples for org.apache.lucene.document.FieldType.setStored
public void setStored(boolean value)
Set to true to store this field. From source file:alix.lucene.Alix.java
License:Open Source License
/** * Parse field type String// ww w. j av a 2 s .co m * * @param name Name of the field * @param value Value of the field * @param options a string composed of letters in any order following Luke convention to describe fields * IdfpoPSV * I: Indexed * d: docs * f: freqs * p: pos * o: offset * P: payloads * S: Stored * V: TermVector */ public static FieldType fieldType(String options) { FieldType type; if (options == null) return new FieldType(); if ("S".equals(options)) { type = new FieldType(); type.setStored(true); return type; } if (options.contains("S")) { type = new FieldType(TextField.TYPE_STORED); } else { type = new FieldType(TextField.TYPE_NOT_STORED); } // optimize ? type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); if (options.contains("p")) { type.setStoreTermVectorPositions(true); } if (options.contains("o")) { type.setTokenized(true); type.setStoreTermVectors(true); type.setStoreTermVectorOffsets(true); } if (options.contains("P")) { type.setTokenized(true); type.setStoreTermVectors(true); type.setStoreTermVectorPositions(true); type.setStoreTermVectorPayloads(true); } if (options.contains("V")) { type.setTokenized(true); type.setStoreTermVectors(true); } return type; }
From source file:api.startup.PDFIndexer.java
License:Open Source License
/** * Indexes a single document and writes it to the given index writer * @param writer - the index writer to writer * @param metadata - the document//from ww w . java 2 s .co m * @throws IOException */ static void indexDoc(IndexWriter writer, DocumentMetadata metadata) throws IOException { Path file = Paths.get(metadata.getFilename()); try { Document doc = new Document(); Field pathField = new StringField(Constants.FIELD_PATH, file.toString(), Field.Store.YES); doc.add(pathField); // Add Document metadata // doc.add(new StringField(Constants.FIELD_AUTHOR, metadata.getAuthor(), Field.Store.YES)); doc.add(new StringField(Constants.FIELD_TITLE, metadata.getTitle(), Field.Store.YES)); doc.add(new StringField(Constants.FIELD_CONFERENCE, metadata.getConference(), Field.Store.YES)); // End of Document Metadata // Field modified = new LongField(Constants.FIELD_MODIFIED, Files.getLastModifiedTime(file).toMillis(), Field.Store.YES); doc.add(modified); PDFTextExtractor extractor = new PDFTextExtractor(); // Get the string contents String textContents = extractor.extractText(file.toString()); // Store the string contents FieldType contentsType = new FieldType(); contentsType.setStored(true); contentsType.setTokenized(true); contentsType.setStoreTermVectors(true); contentsType.setStoreTermVectorPositions(true); contentsType.setStoreTermVectorPayloads(true); contentsType.setStoreTermVectorOffsets(true); contentsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); Field contents = new Field(Constants.FIELD_CONTENTS, textContents, contentsType); doc.add(contents); if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): log.info("adding " + file + " to index"); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: 
log.info("updating " + file + " in index"); writer.updateDocument(new Term(Constants.FIELD_PATH, file.toString()), doc); } } catch (IOException e) { log.error("Failed to read file " + metadata.getFilename()); } }
From source file:be.ugent.tiwi.sleroux.newsrec.newsreclib.utils.NewsItemLuceneDocConverter.java
License:Apache License
private static FieldType getTextType() { FieldType ftype = new FieldType(); ftype.setIndexed(true);// w w w.j a v a2s. c o m ftype.setStoreTermVectors(true); ftype.setStored(true); ftype.freeze(); return ftype; }
From source file:be.ugent.tiwi.sleroux.newsrec.stormNewsFetch.storm.bolts.LuceneIndexBolt.java
License:Apache License
private FieldType getType() { FieldType ftype = new FieldType(); ftype.setIndexed(true);//w w w . j av a 2 s .co m ftype.setStoreTermVectors(true); ftype.setStored(true); return ftype; }
From source file:cc.twittertools.index.IndexStatuses.java
License:Apache License
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(new Option(HELP_OPTION, "show help")); options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment")); options.addOption(new Option(STORE_TERM_VECTORS_OPTION, "store term vectors")); options.addOption(OptionBuilder.withArgName("dir").hasArg().withDescription("source collection directory") .create(COLLECTION_OPTION)); options.addOption(// w w w. j av a2 s.c o m OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("file").hasArg().withDescription("file with deleted tweetids") .create(DELETES_OPTION)); options.addOption(OptionBuilder.withArgName("id").hasArg().withDescription("max id").create(MAX_ID_OPTION)); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(IndexStatuses.class.getName(), options); System.exit(-1); } String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION); String indexPath = cmdline.getOptionValue(INDEX_OPTION); final FieldType textOptions = new FieldType(); textOptions.setIndexed(true); textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); textOptions.setStored(true); textOptions.setTokenized(true); if (cmdline.hasOption(STORE_TERM_VECTORS_OPTION)) { textOptions.setStoreTermVectors(true); } LOG.info("collection: " + collectionPath); LOG.info("index: " + indexPath); LongOpenHashSet deletes = null; if (cmdline.hasOption(DELETES_OPTION)) { deletes = new LongOpenHashSet(); File deletesFile = new 
File(cmdline.getOptionValue(DELETES_OPTION)); if (!deletesFile.exists()) { System.err.println("Error: " + deletesFile + " does not exist!"); System.exit(-1); } LOG.info("Reading deletes from " + deletesFile); FileInputStream fin = new FileInputStream(deletesFile); byte[] ignoreBytes = new byte[2]; fin.read(ignoreBytes); // "B", "Z" bytes from commandline tools BufferedReader br = new BufferedReader(new InputStreamReader(new CBZip2InputStream(fin))); String s; while ((s = br.readLine()) != null) { if (s.contains("\t")) { deletes.add(Long.parseLong(s.split("\t")[0])); } else { deletes.add(Long.parseLong(s)); } } br.close(); fin.close(); LOG.info("Read " + deletes.size() + " tweetids from deletes file."); } long maxId = Long.MAX_VALUE; if (cmdline.hasOption(MAX_ID_OPTION)) { maxId = Long.parseLong(cmdline.getOptionValue(MAX_ID_OPTION)); LOG.info("index: " + maxId); } long startTime = System.currentTimeMillis(); File file = new File(collectionPath); if (!file.exists()) { System.err.println("Error: " + file + " does not exist!"); System.exit(-1); } StatusStream stream = new JsonStatusCorpusReader(file); Directory dir = FSDirectory.open(new File(indexPath)); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, IndexStatuses.ANALYZER); config.setOpenMode(OpenMode.CREATE); IndexWriter writer = new IndexWriter(dir, config); int cnt = 0; Status status; try { while ((status = stream.next()) != null) { if (status.getText() == null) { continue; } // Skip deletes tweetids. 
if (deletes != null && deletes.contains(status.getId())) { continue; } if (status.getId() > maxId) { continue; } cnt++; Document doc = new Document(); doc.add(new LongField(StatusField.ID.name, status.getId(), Field.Store.YES)); doc.add(new LongField(StatusField.EPOCH.name, status.getEpoch(), Field.Store.YES)); doc.add(new TextField(StatusField.SCREEN_NAME.name, status.getScreenname(), Store.YES)); doc.add(new Field(StatusField.TEXT.name, status.getText(), textOptions)); doc.add(new IntField(StatusField.FRIENDS_COUNT.name, status.getFollowersCount(), Store.YES)); doc.add(new IntField(StatusField.FOLLOWERS_COUNT.name, status.getFriendsCount(), Store.YES)); doc.add(new IntField(StatusField.STATUSES_COUNT.name, status.getStatusesCount(), Store.YES)); long inReplyToStatusId = status.getInReplyToStatusId(); if (inReplyToStatusId > 0) { doc.add(new LongField(StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId, Field.Store.YES)); doc.add(new LongField(StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId(), Field.Store.YES)); } String lang = status.getLang(); if (!lang.equals("unknown")) { doc.add(new TextField(StatusField.LANG.name, status.getLang(), Store.YES)); } long retweetStatusId = status.getRetweetedStatusId(); if (retweetStatusId > 0) { doc.add(new LongField(StatusField.RETWEETED_STATUS_ID.name, retweetStatusId, Field.Store.YES)); doc.add(new LongField(StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId(), Field.Store.YES)); doc.add(new IntField(StatusField.RETWEET_COUNT.name, status.getRetweetCount(), Store.YES)); if (status.getRetweetCount() < 0 || status.getRetweetedStatusId() < 0) { LOG.warn("Error parsing retweet fields of " + status.getId()); } } writer.addDocument(doc); if (cnt % 100000 == 0) { LOG.info(cnt + " statuses indexed"); } } LOG.info(String.format("Total of %s statuses added", cnt)); if (cmdline.hasOption(OPTIMIZE_OPTION)) { LOG.info("Merging segments..."); writer.forceMerge(1); LOG.info("Done!"); } LOG.info("Total 
elapsed time: " + (System.currentTimeMillis() - startTime) + "ms"); } catch (Exception e) { e.printStackTrace(); } finally { writer.close(); dir.close(); stream.close(); } }
From source file:ci6226.buildindex.java
/** * @param args the command line arguments *//* w w w.j a v a 2 s . c o m*/ public static void main(String[] args) throws FileNotFoundException, IOException, ParseException { String file = "/home/steven/Dropbox/workspace/ntu_coursework/ci6226/Assiment/yelpdata/yelp_training_set/yelp_training_set_review.json"; JSONParser parser = new JSONParser(); BufferedReader in = new BufferedReader(new FileReader(file)); // List<Document> jdocs = new LinkedList<Document>(); Date start = new Date(); String indexPath = "./myindex"; System.out.println("Indexing to directory '" + indexPath + "'..."); // Analyzer analyzer= new NGramAnalyzer(2,8); Analyzer analyzer = new myAnalyzer(); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, analyzer); Directory dir = FSDirectory.open(new File(indexPath)); // :Post-Release-Update-Version.LUCENE_XY: // TODO: try different analyzer,stop words,words steming check size // Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47); // Add new documents to an existing index: // iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); // Optional: for better indexing performance, if you // are indexing many documents, increase the RAM // buffer. 
But if you do this, increase the max heap // size to the JVM (eg add -Xmx512m or -Xmx1g): // // iwc.setRAMBufferSizeMB(256.0); IndexWriter writer = new IndexWriter(dir, iwc); // writer.addDocuments(jdocs); int line = 0; while (in.ready()) { String s = in.readLine(); Object obj = JSONValue.parse(s); JSONObject person = (JSONObject) obj; String text = (String) person.get("text"); String user_id = (String) person.get("user_id"); String business_id = (String) person.get("business_id"); String review_id = (String) person.get("review_id"); JSONObject votes = (JSONObject) person.get("votes"); long funny = (Long) votes.get("funny"); long cool = (Long) votes.get("cool"); long useful = (Long) votes.get("useful"); Document doc = new Document(); Field review_idf = new StringField("review_id", review_id, Field.Store.YES); doc.add(review_idf); Field business_idf = new StringField("business_id", business_id, Field.Store.YES); doc.add(business_idf); //http://qindongliang1922.iteye.com/blog/2030639 FieldType ft = new FieldType(); ft.setIndexed(true);// ft.setStored(true);// ft.setStoreTermVectors(true); ft.setTokenized(true); ft.setStoreTermVectorPositions(true);//? ft.setStoreTermVectorOffsets(true);//??? 
Field textf = new Field("text", text, ft); doc.add(textf); // Field user_idf = new StringField("user_id", user_id, Field.Store.YES); // doc.add(user_idf); // doc.add(new LongField("cool", cool, Field.Store.YES)); // doc.add(new LongField("funny", funny, Field.Store.YES)); // doc.add(new LongField("useful", useful, Field.Store.YES)); writer.addDocument(doc); System.out.println(line++); } writer.close(); Date end = new Date(); System.out.println(end.getTime() - start.getTime() + " total milliseconds"); // BufferedReader in = new BufferedReader(new FileReader(file)); //while (in.ready()) { // String s = in.readLine(); // //System.out.println(s); // JSONObject jsonObject = (JSONObject) ((Object)s); // String rtext = (String) jsonObject.get("text"); // System.out.println(rtext); // //long age = (Long) jsonObject.get("age"); // //System.out.println(age); //} //in.close(); }
From source file:ci6226.eval_index_writer.java
/**
 * Builds a Lucene index under {@code "./" + _dir} from a file of reviews, one
 * JSON object per line, using the supplied analyzer.
 *
 * <p>For each line the {@code review_id} value is added as a stored string
 * field, and {@code text} is added as an indexed, stored, tokenized field with
 * term vectors (positions and offsets). The total elapsed time is printed at
 * the end.
 *
 * @param _analyzer analyzer passed to the index writer configuration
 * @param _iReviewLocation path of the review file to read
 * @param _dir name of the index directory, resolved relative to {@code ./}
 * @throws IOException if the review file or the index cannot be read or written
 */
public eval_index_writer(Analyzer _analyzer, String _iReviewLocation, String _dir) throws IOException {
    BufferedReader in = new BufferedReader(new FileReader(_iReviewLocation));
    try {
        Date start = new Date();
        String indexPath = "./" + _dir;
        System.out.println("Indexing to directory '" + indexPath + "'...");

        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, _analyzer);
        Directory dir = FSDirectory.open(new File(indexPath));
        iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
        IndexWriter writer = new IndexWriter(dir, iwc);
        try {
            // The text field type is the same for every document, so build it once.
            // http://qindongliang1922.iteye.com/blog/2030639
            FieldType ft = new FieldType();
            ft.setIndexed(true);
            ft.setStored(true);
            ft.setStoreTermVectors(true);
            ft.setTokenized(true);
            ft.setStoreTermVectorPositions(true);
            ft.setStoreTermVectorOffsets(true);

            String s;
            // readLine() returning null is the end-of-file signal.
            while ((s = in.readLine()) != null) {
                JSONObject review = (JSONObject) JSONValue.parse(s);
                String text = (String) review.get("text");
                String review_id = (String) review.get("review_id");

                Document doc = new Document();
                doc.add(new StringField("review_id", review_id, Field.Store.YES));
                doc.add(new Field("text", text, ft));

                writer.addDocument(doc);
            }
        } finally {
            // Close the writer and the directory even when a line fails to index.
            try {
                writer.close();
            } finally {
                dir.close();
            }
        }

        Date end = new Date();
        System.out.println(end.getTime() - start.getTime() + " total milliseconds");
    } finally {
        in.close();
    }
}
From source file:com.epam.catgenome.dao.index.FeatureIndexDao.java
License:Open Source License
/**
 * Adds the fields shared by every feature index entry to a Lucene document:
 * feature id, chromosome id and name, start and end index (as point, stored
 * and grouping doc-values fields), feature type, file id, feature name, and
 * the entry UUID, plus facet fields for chromosome id and UUID.
 *
 * @param document the document to add the fields to
 * @param entry the feature index entry supplying the values
 * @param featureFileId id of the feature file the entry belongs to
 */
private void addCommonDocumentFields(Document document, FeatureIndexEntry entry, final Long featureFileId) {
    document.add(new SortedStringField(FeatureIndexFields.FEATURE_ID.getFieldName(), entry.getFeatureId()));

    // Chromosome id: stored, untokenized, docs-only postings, sorted doc values.
    FieldType fieldType = new FieldType();
    fieldType.setOmitNorms(true);
    fieldType.setIndexOptions(IndexOptions.DOCS);
    fieldType.setStored(true);
    fieldType.setTokenized(false);
    fieldType.setDocValuesType(DocValuesType.SORTED);
    fieldType.freeze();
    Field field = new Field(FeatureIndexFields.CHROMOSOME_ID.getFieldName(),
            entry.getChromosome() != null ? new BytesRef(entry.getChromosome().getId().toString())
                    : new BytesRef(""),
            fieldType);
    document.add(field);
    // NOTE(review): the chromosome is null-checked above but dereferenced
    // unconditionally here and for CHR_ID below - confirm it can never be null.
    document.add(new SortedStringField(FeatureIndexFields.CHROMOSOME_NAME.getFieldName(),
            entry.getChromosome().getName(), true));

    document.add(new SortedIntPoint(FeatureIndexFields.START_INDEX.getFieldName(), entry.getStartIndex()));
    document.add(new StoredField(FeatureIndexFields.START_INDEX.getFieldName(), entry.getStartIndex()));
    document.add(new SortedDocValuesField(FeatureIndexFields.START_INDEX.getGroupName(),
            new BytesRef(entry.getStartIndex().toString())));

    document.add(new SortedIntPoint(FeatureIndexFields.END_INDEX.getFieldName(), entry.getEndIndex()));
    document.add(new StoredField(FeatureIndexFields.END_INDEX.getFieldName(), entry.getEndIndex()));
    // The END_INDEX grouping value must come from the end index (it was built
    // from the start index before, a copy-paste slip).
    document.add(new SortedDocValuesField(FeatureIndexFields.END_INDEX.getGroupName(),
            new BytesRef(entry.getEndIndex().toString())));

    document.add(new StringField(FeatureIndexFields.FEATURE_TYPE.getFieldName(),
            entry.getFeatureType() != null ? entry.getFeatureType().getFileValue() : "", Field.Store.YES));
    document.add(new StringField(FeatureIndexFields.FILE_ID.getFieldName(), featureFileId.toString(),
            Field.Store.YES));

    // The searchable feature name is lower-cased; the doc-values copy keeps the original case.
    document.add(new StringField(FeatureIndexFields.FEATURE_NAME.getFieldName(),
            entry.getFeatureName() != null ? entry.getFeatureName().toLowerCase() : "", Field.Store.YES));
    document.add(new SortedDocValuesField(FeatureIndexFields.FEATURE_NAME.getFieldName(),
            new BytesRef(entry.getFeatureName() != null ? entry.getFeatureName() : "")));

    document.add(new SortedSetDocValuesFacetField(FeatureIndexFields.CHR_ID.getFieldName(),
            entry.getChromosome().getId().toString()));

    document.add(new SortedStringField(FeatureIndexFields.UID.getFieldName(), entry.getUuid().toString()));
    document.add(new SortedSetDocValuesFacetField(FeatureIndexFields.F_UID.getFieldName(),
            entry.getUuid().toString()));
}
From source file:com.github.hotware.lucene.extension.bean.field.BeanInformationCacheImpl.java
License:BEER-WARE LICENSE
private FieldInformation buildFieldInformation(BeanField bf, Field field, Class<?> fieldClass) { com.github.hotware.lucene.extension.bean.type.Type typeWrapper; try {//from ww w .ja v a 2 s. c o m // TODO: maybe cache these? typeWrapper = (com.github.hotware.lucene.extension.bean.type.Type) bf.type().newInstance(); } catch (InstantiationException | IllegalAccessException e) { throw new RuntimeException(e); } FieldType fieldType = new FieldType(); fieldType.setIndexed(bf.index()); fieldType.setStored(bf.store()); fieldType.setTokenized(bf.tokenized()); fieldType.setStoreTermVectors(bf.storeTermVectors()); fieldType.setStoreTermVectorPositions(bf.storeTermVectorPositions()); fieldType.setStoreTermVectorOffsets(bf.storeTermVectorOffsets()); fieldType.setStoreTermVectorPayloads(bf.storeTermVectorPayloads()); fieldType.setOmitNorms(bf.omitNorms()); fieldType.setIndexOptions(bf.indexOptions()); typeWrapper.configureFieldType(fieldType); fieldType.freeze(); return new FieldInformation(new FrozenField(field), fieldClass, fieldType, bf); }
From source file:com.globalsight.ling.lucene.IndexDocument.java
License:Apache License
static public Document IndexDocument(long p_mainId, long p_subId, String p_text) { Document result = new Document(); FieldType ft; // Add the main id (tu id, concept id) as a field named // "mainid". Use a Keyword field so that the id is stored // with the document, and is searchable. //result.add(Field.Keyword(MAINID, String.valueOf(p_mainId))); ft = new FieldType(); ft.setTokenized(false);/* w ww. j av a 2 s . c o m*/ ft.setIndexed(false); ft.setStored(true); result.add(new Field(MAINID, String.valueOf(p_mainId), ft)); // Add the sub id (tuv id, term id) as a field named // "subid". Use a Keyword field so that the id is stored // with the document, and is searchable. //result.add(Field.Keyword(SUBID, String.valueOf(p_subId))); ft = new FieldType(); ft.setTokenized(false); ft.setIndexed(false); ft.setStored(true); result.add(new Field(SUBID, String.valueOf(p_subId), ft)); // Add the contents as an UnStored field so it will get // tokenized and indexed, but not stored. // result.add(Field.UnStored(TEXT, p_text)); ft = new FieldType(); ft.setTokenized(true); ft.setIndexed(true); ft.setStored(false); result.add(new Field(TEXT, p_text, ft)); return result; }