List of usage examples for org.apache.lucene.document FieldType setStoreTermVectorPositions
public void setStoreTermVectorPositions(boolean value)
Set to true to also store token positions into the term vector for this field. From source file: alix.lucene.Alix.java
License:Open Source License
/** * Parse field type String/* ww w . j a va2 s . c om*/ * * @param name Name of the field * @param value Value of the field * @param options a string composed of letters in any order following Luke convention to describe fields * IdfpoPSV * I: Indexed * d: docs * f: freqs * p: pos * o: offset * P: payloads * S: Stored * V: TermVector */ public static FieldType fieldType(String options) { FieldType type; if (options == null) return new FieldType(); if ("S".equals(options)) { type = new FieldType(); type.setStored(true); return type; } if (options.contains("S")) { type = new FieldType(TextField.TYPE_STORED); } else { type = new FieldType(TextField.TYPE_NOT_STORED); } // optimize ? type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); if (options.contains("p")) { type.setStoreTermVectorPositions(true); } if (options.contains("o")) { type.setTokenized(true); type.setStoreTermVectors(true); type.setStoreTermVectorOffsets(true); } if (options.contains("P")) { type.setTokenized(true); type.setStoreTermVectors(true); type.setStoreTermVectorPositions(true); type.setStoreTermVectorPayloads(true); } if (options.contains("V")) { type.setTokenized(true); type.setStoreTermVectors(true); } return type; }
From source file:api.startup.PDFIndexer.java
License:Open Source License
/** * Indexes a single document and writes it to the given index writer * @param writer - the index writer to writer * @param metadata - the document//from w w w . j ava2s . co m * @throws IOException */ static void indexDoc(IndexWriter writer, DocumentMetadata metadata) throws IOException { Path file = Paths.get(metadata.getFilename()); try { Document doc = new Document(); Field pathField = new StringField(Constants.FIELD_PATH, file.toString(), Field.Store.YES); doc.add(pathField); // Add Document metadata // doc.add(new StringField(Constants.FIELD_AUTHOR, metadata.getAuthor(), Field.Store.YES)); doc.add(new StringField(Constants.FIELD_TITLE, metadata.getTitle(), Field.Store.YES)); doc.add(new StringField(Constants.FIELD_CONFERENCE, metadata.getConference(), Field.Store.YES)); // End of Document Metadata // Field modified = new LongField(Constants.FIELD_MODIFIED, Files.getLastModifiedTime(file).toMillis(), Field.Store.YES); doc.add(modified); PDFTextExtractor extractor = new PDFTextExtractor(); // Get the string contents String textContents = extractor.extractText(file.toString()); // Store the string contents FieldType contentsType = new FieldType(); contentsType.setStored(true); contentsType.setTokenized(true); contentsType.setStoreTermVectors(true); contentsType.setStoreTermVectorPositions(true); contentsType.setStoreTermVectorPayloads(true); contentsType.setStoreTermVectorOffsets(true); contentsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); Field contents = new Field(Constants.FIELD_CONTENTS, textContents, contentsType); doc.add(contents); if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): log.info("adding " + file + " to index"); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: 
log.info("updating " + file + " in index"); writer.updateDocument(new Term(Constants.FIELD_PATH, file.toString()), doc); } } catch (IOException e) { log.error("Failed to read file " + metadata.getFilename()); } }
From source file:ci6226.buildindex.java
/** * @param args the command line arguments *//*from w ww .j ava 2s . c om*/ public static void main(String[] args) throws FileNotFoundException, IOException, ParseException { String file = "/home/steven/Dropbox/workspace/ntu_coursework/ci6226/Assiment/yelpdata/yelp_training_set/yelp_training_set_review.json"; JSONParser parser = new JSONParser(); BufferedReader in = new BufferedReader(new FileReader(file)); // List<Document> jdocs = new LinkedList<Document>(); Date start = new Date(); String indexPath = "./myindex"; System.out.println("Indexing to directory '" + indexPath + "'..."); // Analyzer analyzer= new NGramAnalyzer(2,8); Analyzer analyzer = new myAnalyzer(); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, analyzer); Directory dir = FSDirectory.open(new File(indexPath)); // :Post-Release-Update-Version.LUCENE_XY: // TODO: try different analyzer,stop words,words steming check size // Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47); // Add new documents to an existing index: // iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); // Optional: for better indexing performance, if you // are indexing many documents, increase the RAM // buffer. 
But if you do this, increase the max heap // size to the JVM (eg add -Xmx512m or -Xmx1g): // // iwc.setRAMBufferSizeMB(256.0); IndexWriter writer = new IndexWriter(dir, iwc); // writer.addDocuments(jdocs); int line = 0; while (in.ready()) { String s = in.readLine(); Object obj = JSONValue.parse(s); JSONObject person = (JSONObject) obj; String text = (String) person.get("text"); String user_id = (String) person.get("user_id"); String business_id = (String) person.get("business_id"); String review_id = (String) person.get("review_id"); JSONObject votes = (JSONObject) person.get("votes"); long funny = (Long) votes.get("funny"); long cool = (Long) votes.get("cool"); long useful = (Long) votes.get("useful"); Document doc = new Document(); Field review_idf = new StringField("review_id", review_id, Field.Store.YES); doc.add(review_idf); Field business_idf = new StringField("business_id", business_id, Field.Store.YES); doc.add(business_idf); //http://qindongliang1922.iteye.com/blog/2030639 FieldType ft = new FieldType(); ft.setIndexed(true);// ft.setStored(true);// ft.setStoreTermVectors(true); ft.setTokenized(true); ft.setStoreTermVectorPositions(true);//? ft.setStoreTermVectorOffsets(true);//??? 
Field textf = new Field("text", text, ft); doc.add(textf); // Field user_idf = new StringField("user_id", user_id, Field.Store.YES); // doc.add(user_idf); // doc.add(new LongField("cool", cool, Field.Store.YES)); // doc.add(new LongField("funny", funny, Field.Store.YES)); // doc.add(new LongField("useful", useful, Field.Store.YES)); writer.addDocument(doc); System.out.println(line++); } writer.close(); Date end = new Date(); System.out.println(end.getTime() - start.getTime() + " total milliseconds"); // BufferedReader in = new BufferedReader(new FileReader(file)); //while (in.ready()) { // String s = in.readLine(); // //System.out.println(s); // JSONObject jsonObject = (JSONObject) ((Object)s); // String rtext = (String) jsonObject.get("text"); // System.out.println(rtext); // //long age = (Long) jsonObject.get("age"); // //System.out.println(age); //} //in.close(); }
From source file:ci6226.eval_index_writer.java
public eval_index_writer(Analyzer _analyzer, String _iReviewLocation, String _dir) throws IOException { String file = _iReviewLocation; JSONParser parser = new JSONParser(); BufferedReader in = new BufferedReader(new FileReader(file)); Date start = new Date(); String indexPath = "./" + _dir; System.out.println("Indexing to directory '" + indexPath + "'..."); Analyzer analyzer = _analyzer;//from www. j ava 2 s . com IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, analyzer); Directory dir = FSDirectory.open(new File(indexPath)); iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); IndexWriter writer = new IndexWriter(dir, iwc); // int line=0; while (in.ready()) { String s = in.readLine(); Object obj = JSONValue.parse(s); JSONObject person = (JSONObject) obj; String text = (String) person.get("text"); String user_id = (String) person.get("user_id"); String business_id = (String) person.get("business_id"); String review_id = (String) person.get("review_id"); JSONObject votes = (JSONObject) person.get("votes"); long funny = (Long) votes.get("funny"); long cool = (Long) votes.get("cool"); long useful = (Long) votes.get("useful"); Document doc = new Document(); Field review_idf = new StringField("review_id", review_id, Field.Store.YES); doc.add(review_idf); // Field business_idf = new StringField("business_id", business_id, Field.Store.YES); // doc.add(business_idf); //http://qindongliang1922.iteye.com/blog/2030639 FieldType ft = new FieldType(); ft.setIndexed(true);// ft.setStored(true);// ft.setStoreTermVectors(true); ft.setTokenized(true); ft.setStoreTermVectorPositions(true);// ft.setStoreTermVectorOffsets(true);// Field textf = new Field("text", text, ft); doc.add(textf); // Field user_idf = new StringField("user_id", user_id, Field.Store.YES); // doc.add(user_idf); // doc.add(new LongField("cool", cool, Field.Store.YES)); // doc.add(new LongField("funny", funny, Field.Store.YES)); // doc.add(new LongField("useful", useful, Field.Store.YES)); 
writer.addDocument(doc); // System.out.println(line++); } writer.close(); Date end = new Date(); System.out.println(end.getTime() - start.getTime() + " total milliseconds"); }
From source file:com.github.hotware.lucene.extension.bean.field.BeanInformationCacheImpl.java
License:BEER-WARE LICENSE
private FieldInformation buildFieldInformation(BeanField bf, Field field, Class<?> fieldClass) { com.github.hotware.lucene.extension.bean.type.Type typeWrapper; try {/*from w w w .j ava 2 s .c o m*/ // TODO: maybe cache these? typeWrapper = (com.github.hotware.lucene.extension.bean.type.Type) bf.type().newInstance(); } catch (InstantiationException | IllegalAccessException e) { throw new RuntimeException(e); } FieldType fieldType = new FieldType(); fieldType.setIndexed(bf.index()); fieldType.setStored(bf.store()); fieldType.setTokenized(bf.tokenized()); fieldType.setStoreTermVectors(bf.storeTermVectors()); fieldType.setStoreTermVectorPositions(bf.storeTermVectorPositions()); fieldType.setStoreTermVectorOffsets(bf.storeTermVectorOffsets()); fieldType.setStoreTermVectorPayloads(bf.storeTermVectorPayloads()); fieldType.setOmitNorms(bf.omitNorms()); fieldType.setIndexOptions(bf.indexOptions()); typeWrapper.configureFieldType(fieldType); fieldType.freeze(); return new FieldInformation(new FrozenField(field), fieldClass, fieldType, bf); }
From source file:com.meizu.nlp.classification.ClassificationTestBase.java
License:Apache License
private void populatePerformanceIndex(Analyzer analyzer) throws IOException { indexWriter.close();//from w w w .j a v a 2 s.c om indexWriter = new RandomIndexWriter(random(), dir, newIndexWriterConfig(analyzer).setOpenMode(IndexWriterConfig.OpenMode.CREATE)); indexWriter.commit(); FieldType ft = new FieldType(TextField.TYPE_STORED); ft.setStoreTermVectors(true); ft.setStoreTermVectorOffsets(true); ft.setStoreTermVectorPositions(true); int docs = 1000; Random random = random(); for (int i = 0; i < docs; i++) { boolean b = random.nextBoolean(); Document doc = new Document(); doc.add(new Field(textFieldName, createRandomString(random), ft)); doc.add(new Field(categoryFieldName, b ? "technology" : "politics", ft)); doc.add(new Field(booleanFieldName, String.valueOf(b), ft)); indexWriter.addDocument(doc); } indexWriter.commit(); }
From source file:com.meizu.nlp.classification.utils.DatasetSplitter.java
License:Apache License
/** * Split a given index into 3 indexes for training, test and cross validation tasks respectively * * @param originalIndex an {@link org.apache.lucene.index.LeafReader} on the source index * @param trainingIndex a {@link Directory} used to write the training index * @param testIndex a {@link Directory} used to write the test index * @param crossValidationIndex a {@link Directory} used to write the cross validation index * @param analyzer {@link Analyzer} used to create the new docs * @param fieldNames names of fields that need to be put in the new indexes or <code>null</code> if all should be used * @throws IOException if any writing operation fails on any of the indexes *///from w ww . jav a2 s . co m public void split(LeafReader originalIndex, Directory trainingIndex, Directory testIndex, Directory crossValidationIndex, Analyzer analyzer, String... fieldNames) throws IOException { // create IWs for train / test / cv IDXs IndexWriter testWriter = new IndexWriter(testIndex, new IndexWriterConfig(analyzer)); IndexWriter cvWriter = new IndexWriter(crossValidationIndex, new IndexWriterConfig(analyzer)); IndexWriter trainingWriter = new IndexWriter(trainingIndex, new IndexWriterConfig(analyzer)); try { int size = originalIndex.maxDoc(); IndexSearcher indexSearcher = new IndexSearcher(originalIndex); TopDocs topDocs = indexSearcher.search(new MatchAllDocsQuery(), Integer.MAX_VALUE); // set the type to be indexed, stored, with term vectors FieldType ft = new FieldType(TextField.TYPE_STORED); ft.setStoreTermVectors(true); ft.setStoreTermVectorOffsets(true); ft.setStoreTermVectorPositions(true); int b = 0; // iterate over existing documents for (ScoreDoc scoreDoc : topDocs.scoreDocs) { // create a new document for indexing Document doc = new Document(); if (fieldNames != null && fieldNames.length > 0) { for (String fieldName : fieldNames) { doc.add(new Field(fieldName, originalIndex.document(scoreDoc.doc).getField(fieldName).stringValue(), ft)); } } else { for 
(IndexableField storableField : originalIndex.document(scoreDoc.doc).getFields()) { if (storableField.readerValue() != null) { doc.add(new Field(storableField.name(), storableField.readerValue(), ft)); } else if (storableField.binaryValue() != null) { doc.add(new Field(storableField.name(), storableField.binaryValue(), ft)); } else if (storableField.stringValue() != null) { doc.add(new Field(storableField.name(), storableField.stringValue(), ft)); } else if (storableField.numericValue() != null) { doc.add(new Field(storableField.name(), storableField.numericValue().toString(), ft)); } } } // add it to one of the IDXs if (b % 2 == 0 && testWriter.maxDoc() < size * testRatio) { testWriter.addDocument(doc); } else if (cvWriter.maxDoc() < size * crossValidationRatio) { cvWriter.addDocument(doc); } else { trainingWriter.addDocument(doc); } b++; } } catch (Exception e) { throw new IOException(e); } finally { testWriter.commit(); cvWriter.commit(); trainingWriter.commit(); // close IWs testWriter.close(); cvWriter.close(); trainingWriter.close(); } }
From source file:com.meizu.nlp.classification.utils.DataSplitterTest.java
License:Apache License
/**
 * Prepares the test fixture: a new directory holding 100 documents, each with an id
 * field, a random text field and a random class field, all stored with term vectors
 * (offsets and positions). The committed index is then exposed through
 * {@code originalIndex}.
 *
 * @throws Exception if the parent set-up or the indexing fails
 */
@Override
@Before
public void setUp() throws Exception {
    super.setUp();
    dir = newDirectory();
    indexWriter = new RandomIndexWriter(random(), dir);

    FieldType vectorType = new FieldType(TextField.TYPE_STORED);
    vectorType.setStoreTermVectors(true);
    vectorType.setStoreTermVectorOffsets(true);
    vectorType.setStoreTermVectorPositions(true);

    Random rnd = random();
    for (int id = 0; id < 100; id++) {
        // The text value is drawn before the class value, in that order.
        String text = TestUtil.randomUnicodeString(rnd, 1024);
        String label = TestUtil.randomUnicodeString(rnd, 10);

        Document doc = new Document();
        doc.add(new Field(idFieldName, Integer.toString(id), vectorType));
        doc.add(new Field(textFieldName, text, vectorType));
        doc.add(new Field(classFieldName, label, vectorType));
        indexWriter.addDocument(doc);
    }

    indexWriter.commit();

    originalIndex = SlowCompositeReaderWrapper.wrap(indexWriter.getReader());
}
From source file:com.meizu.nlp.classification.utils.DocToDoubleVectorUtilsTest.java
License:Apache License
@Override @Before//from ww w. jav a2 s . com public void setUp() throws Exception { super.setUp(); dir = newDirectory(); RandomIndexWriter indexWriter = new RandomIndexWriter(random(), dir); FieldType ft = new FieldType(TextField.TYPE_STORED); ft.setStoreTermVectors(true); ft.setStoreTermVectorOffsets(true); ft.setStoreTermVectorPositions(true); Document doc; for (int i = 0; i < 10; i++) { doc = new Document(); doc.add(new Field("id", Integer.toString(i), ft)); doc.add(new Field("text", random().nextInt(10) + " " + random().nextInt(10) + " " + random().nextInt(10), ft)); indexWriter.addDocument(doc); } indexWriter.commit(); index = indexWriter.getReader(); indexWriter.close(); }
From source file:com.o19s.solr.swan.highlight.TermVectorFun.java
License:Apache License
@Test public void testBlah() throws IOException { RAMDirectory ramDir = new RAMDirectory(); // Index some made up content IndexWriterConfig iwf = new IndexWriterConfig(Version.LUCENE_47, new StandardAnalyzer(Version.LUCENE_47)); IndexWriter writer = new IndexWriter(ramDir, iwf); FieldType ft = new FieldType(); ft.setIndexed(true);// w w w . j a v a 2 s .c om ft.setTokenized(true); ft.setStored(true); ft.setStoreTermVectorOffsets(true); ft.setStoreTermVectors(true); ft.setStoreTermVectorPositions(true); ft.freeze(); for (int i = 0; i < DOCS.length; i++) { Document doc = new Document(); StringField id = new StringField("id", "doc_" + i, StringField.Store.YES); doc.add(id); // Store both position and offset information Field text = new Field("content", DOCS[i], ft); // Field.Index.ANALYZED, // Field.TermVector.WITH_POSITIONS_OFFSETS); doc.add(text); writer.addDocument(doc); } //writer.close(); // Get a searcher AtomicReader dr = SlowCompositeReaderWrapper.wrap(DirectoryReader.open(writer, true)); IndexSearcher searcher = new IndexSearcher(dr); // Do a search using SpanQuery SpanTermQuery fleeceQ = new SpanTermQuery(new Term("content", "fleece")); TopDocs results = searcher.search(fleeceQ, 10); for (int i = 0; i < results.scoreDocs.length; i++) { ScoreDoc scoreDoc = results.scoreDocs[i]; System.out.println("Score Doc: " + scoreDoc); } IndexReader reader = searcher.getIndexReader(); Bits acceptDocs = null; Map<Term, TermContext> termContexts = new HashMap<Term, TermContext>(); Spans spans = fleeceQ.getSpans(dr.getContext(), acceptDocs, termContexts); while (spans.next()) { System.out.println("Doc: " + spans.doc() + " Start: " + spans.start() + " End: " + spans.end()); DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor("content"); reader.document(spans.doc(), visitor); Terms terms = reader.getTermVector(spans.doc(), "content"); TermsEnum tenum = terms.iterator(null); // AttributeSource as = tenum.attributes(); while (tenum.next() != null) { 
System.out.println(tenum.term().utf8ToString()); } for (long pos = 0L; pos < spans.end(); pos++) { // tenum.next(); // if (tenum.ord()<pos) continue; // System.out.println(tenum.term()); // } reader.document(spans.doc(), visitor); // String[] values = visitor.getDocument().getValues("content"); // List<String> a = new ArrayList<String>(); // // build up the window // tvm.start = spans.start() - window; // tvm.end = spans.end() + window; // reader.getTermFreqVector(spans.doc(), "content", tvm); // for (WindowEntry entry : tvm.entries.values()) { // System.out.println("Entry: " + entry); // } // // clear out the entries for the next round // tvm.entries.clear(); } }