List of usage examples for `org.apache.lucene.document.FieldType#freeze()`
public void freeze()
From source file:be.ugent.tiwi.sleroux.newsrec.newsreclib.utils.NewsItemLuceneDocConverter.java
License:Apache License
private static FieldType getTextType() { FieldType ftype = new FieldType(); ftype.setIndexed(true);// w w w.ja va 2 s. c o m ftype.setStoreTermVectors(true); ftype.setStored(true); ftype.freeze(); return ftype; }
From source file:com.epam.catgenome.dao.index.FeatureIndexDao.java
License:Open Source License
private void addCommonDocumentFields(Document document, FeatureIndexEntry entry, final Long featureFileId) { document.add(new SortedStringField(FeatureIndexFields.FEATURE_ID.getFieldName(), entry.getFeatureId())); FieldType fieldType = new FieldType(); fieldType.setOmitNorms(true);//from ww w. j a v a2 s . c om fieldType.setIndexOptions(IndexOptions.DOCS); fieldType.setStored(true); fieldType.setTokenized(false); fieldType.setDocValuesType(DocValuesType.SORTED); fieldType.freeze(); Field field = new Field(FeatureIndexFields.CHROMOSOME_ID.getFieldName(), entry.getChromosome() != null ? new BytesRef(entry.getChromosome().getId().toString()) : new BytesRef(""), fieldType); document.add(field); document.add(new SortedStringField(FeatureIndexFields.CHROMOSOME_NAME.getFieldName(), entry.getChromosome().getName(), true)); document.add(new SortedIntPoint(FeatureIndexFields.START_INDEX.getFieldName(), entry.getStartIndex())); document.add(new StoredField(FeatureIndexFields.START_INDEX.getFieldName(), entry.getStartIndex())); document.add(new SortedDocValuesField(FeatureIndexFields.START_INDEX.getGroupName(), new BytesRef(entry.getStartIndex().toString()))); document.add(new SortedIntPoint(FeatureIndexFields.END_INDEX.getFieldName(), entry.getEndIndex())); document.add(new StoredField(FeatureIndexFields.END_INDEX.getFieldName(), entry.getEndIndex())); document.add(new SortedDocValuesField(FeatureIndexFields.END_INDEX.getGroupName(), new BytesRef(entry.getStartIndex().toString()))); document.add(new StringField(FeatureIndexFields.FEATURE_TYPE.getFieldName(), entry.getFeatureType() != null ? entry.getFeatureType().getFileValue() : "", Field.Store.YES)); document.add(new StringField(FeatureIndexFields.FILE_ID.getFieldName(), featureFileId.toString(), Field.Store.YES)); document.add(new StringField(FeatureIndexFields.FEATURE_NAME.getFieldName(), entry.getFeatureName() != null ? 
entry.getFeatureName().toLowerCase() : "", Field.Store.YES)); document.add(new SortedDocValuesField(FeatureIndexFields.FEATURE_NAME.getFieldName(), new BytesRef(entry.getFeatureName() != null ? entry.getFeatureName() : ""))); document.add(new SortedSetDocValuesFacetField(FeatureIndexFields.CHR_ID.getFieldName(), entry.getChromosome().getId().toString())); document.add(new SortedStringField(FeatureIndexFields.UID.getFieldName(), entry.getUuid().toString())); document.add(new SortedSetDocValuesFacetField(FeatureIndexFields.F_UID.getFieldName(), entry.getUuid().toString())); }
From source file:com.github.hotware.lucene.extension.bean.field.BeanInformationCacheImpl.java
License:BEER-WARE LICENSE
private FieldInformation buildFieldInformation(BeanField bf, Field field, Class<?> fieldClass) { com.github.hotware.lucene.extension.bean.type.Type typeWrapper; try {//from w ww . jav a2 s . com // TODO: maybe cache these? typeWrapper = (com.github.hotware.lucene.extension.bean.type.Type) bf.type().newInstance(); } catch (InstantiationException | IllegalAccessException e) { throw new RuntimeException(e); } FieldType fieldType = new FieldType(); fieldType.setIndexed(bf.index()); fieldType.setStored(bf.store()); fieldType.setTokenized(bf.tokenized()); fieldType.setStoreTermVectors(bf.storeTermVectors()); fieldType.setStoreTermVectorPositions(bf.storeTermVectorPositions()); fieldType.setStoreTermVectorOffsets(bf.storeTermVectorOffsets()); fieldType.setStoreTermVectorPayloads(bf.storeTermVectorPayloads()); fieldType.setOmitNorms(bf.omitNorms()); fieldType.setIndexOptions(bf.indexOptions()); typeWrapper.configureFieldType(fieldType); fieldType.freeze(); return new FieldInformation(new FrozenField(field), fieldClass, fieldType, bf); }
From source file:com.o19s.solr.swan.highlight.TermVectorFun.java
License:Apache License
@Test
public void testBlah() throws IOException {
    // Indexes a handful of made-up documents into an in-memory directory, runs
    // a SpanTermQuery for "fleece", and walks the matching spans and their
    // per-document term vectors, printing what it finds.
    RAMDirectory ramDir = new RAMDirectory();
    // Index some made up content
    IndexWriterConfig iwf = new IndexWriterConfig(Version.LUCENE_47, new StandardAnalyzer(Version.LUCENE_47));
    IndexWriter writer = new IndexWriter(ramDir, iwf);
    // Field type for "content": indexed, tokenized and stored, with full term
    // vectors (positions + offsets) so spans/vectors can be inspected later.
    FieldType ft = new FieldType();
    ft.setIndexed(true);
    ft.setTokenized(true);
    ft.setStored(true);
    ft.setStoreTermVectorOffsets(true);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPositions(true);
    ft.freeze();
    for (int i = 0; i < DOCS.length; i++) {
        Document doc = new Document();
        StringField id = new StringField("id", "doc_" + i, StringField.Store.YES);
        doc.add(id);
        // Store both position and offset information
        Field text = new Field("content", DOCS[i], ft);
        // Field.Index.ANALYZED,
        // Field.TermVector.WITH_POSITIONS_OFFSETS);
        doc.add(text);
        writer.addDocument(doc);
    }
    //writer.close();
    // Get a searcher
    // NOTE(review): the writer is deliberately left open so that
    // DirectoryReader.open(writer, true) sees the uncommitted (NRT) view;
    // neither writer nor readers are closed afterwards — acceptable in a test,
    // but worth confirming.
    AtomicReader dr = SlowCompositeReaderWrapper.wrap(DirectoryReader.open(writer, true));
    IndexSearcher searcher = new IndexSearcher(dr);
    // Do a search using SpanQuery
    SpanTermQuery fleeceQ = new SpanTermQuery(new Term("content", "fleece"));
    TopDocs results = searcher.search(fleeceQ, 10);
    for (int i = 0; i < results.scoreDocs.length; i++) {
        ScoreDoc scoreDoc = results.scoreDocs[i];
        System.out.println("Score Doc: " + scoreDoc);
    }
    IndexReader reader = searcher.getIndexReader();
    Bits acceptDocs = null;
    Map<Term, TermContext> termContexts = new HashMap<Term, TermContext>();
    Spans spans = fleeceQ.getSpans(dr.getContext(), acceptDocs, termContexts);
    while (spans.next()) {
        System.out.println("Doc: " + spans.doc() + " Start: " + spans.start() + " End: " + spans.end());
        DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor("content");
        reader.document(spans.doc(), visitor);
        // Dump every term in this document's term vector.
        Terms terms = reader.getTermVector(spans.doc(), "content");
        TermsEnum tenum = terms.iterator(null);
        // AttributeSource as = tenum.attributes();
        while (tenum.next() != null) {
            System.out.println(tenum.term().utf8ToString());
        }
        // Re-fetches the stored field once per position up to the span end;
        // the original positional-walk logic is commented out below.
        for (long pos = 0L; pos < spans.end(); pos++) {
            // tenum.next();
            // if (tenum.ord()<pos) continue;
            // System.out.println(tenum.term());
            // }
            reader.document(spans.doc(), visitor);
        }
        // String[] values = visitor.getDocument().getValues("content");
        // List<String> a = new ArrayList<String>();
        // // build up the window
        // tvm.start = spans.start() - window;
        // tvm.end = spans.end() + window;
        // reader.getTermFreqVector(spans.doc(), "content", tvm);
        // for (WindowEntry entry : tvm.entries.values()) {
        //     System.out.println("Entry: " + entry);
        // }
        // // clear out the entries for the next round
        // tvm.entries.clear();
    }
}
From source file:edu.umass.cs.ciir.IndexFromGalago.java
License:Open Source License
public static void main(String[] args) throws Exception { Parameters argp = Parameters.parseArgs(args); String galagoIndexPath = null; String luceneIndexPath = null; try {/*from ww w. j a va 2 s . c om*/ galagoIndexPath = argp.getString("galagoIndex"); luceneIndexPath = argp.getString("luceneIndex"); } catch (Exception e) { System.out.println(getUsage()); return; } logger.setUseParentHandlers(false); FileHandler lfh = new FileHandler("indexing-errors.log"); SimpleFormatter formatter = new SimpleFormatter(); lfh.setFormatter(formatter); logger.addHandler(lfh); final DiskIndex index = new DiskIndex(argp.get("index", galagoIndexPath)); final CorpusReader corpus = (CorpusReader) index.getIndexPart("corpus"); long total = corpus.getManifest().getLong("keyCount"); final CorpusReader.KeyIterator iterator = corpus.getIterator(); final Document.DocumentComponents dcp = Document.DocumentComponents.JustText; // Analyzer includes options for text processing Analyzer analyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { // Step 1: tokenization (Lucene's StandardTokenizer is suitable for most text retrieval occasions) TokenStreamComponents ts = new TokenStreamComponents(new StandardTokenizer()); // Step 2: transforming all tokens into lowercased ones ts = new Analyzer.TokenStreamComponents(ts.getTokenizer(), new LowerCaseFilter(ts.getTokenStream())); // Step 3: whether to remove stop words // Uncomment the following line to remove stop words // ts = new TokenStreamComponents( ts.getTokenizer(), new StopwordsFilter( ts.getTokenStream(), StandardAnalyzer.ENGLISH_STOP_WORDS_SET ) ); // Step 4: whether to apply stemming // Uncomment the following line to apply Krovetz or Porter stemmer // ts = new TokenStreamComponents( ts.getTokenizer(), new KStemFilter( ts.getTokenStream() ) ); // ts = new TokenStreamComponents( ts.getTokenizer(), new PorterStemFilter( ts.getTokenStream() ) ); return ts; } }; try (final FSDirectory dir = 
FSDirectory.open(Paths.get(argp.get("output", luceneIndexPath)))) { final IndexWriterConfig cfg = new IndexWriterConfig(analyzer); System.out.println("Similarity: " + cfg.getSimilarity()); cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE); try (IndexWriter writer = new IndexWriter(dir, cfg)) { iterator.forAllKeyStrings(docId -> { try { Document document = iterator.getDocument(dcp); String text = document.text; String id = document.name; System.out.println("Processing document: " + id); org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document(); doc.add(new StringField("id", id, Field.Store.YES)); // this stores the actual text with tags so formatting is preserved doc.add(new StoredField("body", text)); org.jsoup.nodes.Document jsoup = Jsoup.parse(text); // tokens of the document FieldType fieldTypeText = new FieldType(); fieldTypeText.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); fieldTypeText.setStoreTermVectors(true); fieldTypeText.setStoreTermVectorPositions(true); fieldTypeText.setTokenized(true); fieldTypeText.setStored(false); fieldTypeText.freeze(); doc.add(new Field("tokens", jsoup.text(), fieldTypeText)); try { writer.addDocument(doc); System.out.println("Doc count: " + writer.numDocs()); } catch (IOException e) { logger.log(Level.WARNING, "Pull-Document-Exception", e); System.err.println(e.toString()); } } catch (Exception e) { logger.log(Level.WARNING, "Pull-Document-Exception", e); System.err.println(e.toString()); } }); } } System.out.println("Indexing Done. "); }
From source file:org.aksw.palmetto.corpus.lucene.creation.PositionStoringLuceneIndexCreator.java
License:Open Source License
/** * Creates the index./* ww w. j av a 2 s.c om*/ * * @param indexPath * The path to the director in which the Lucene index will be created * @param docIterator * Iterator that iterates over the document texts. * @return true if the creation was successful, else false. */ public boolean createIndex(File indexPath, Iterator<IndexableDocument> docIterator) { LOGGER.info("Starting index creation..."); IndexWriter writer = null; indexPath.mkdirs(); Analyzer analyzer = new SimpleAnalyzer(true); try { IndexWriterConfig config = new IndexWriterConfig(version, analyzer); config.setOpenMode(OpenMode.CREATE); FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED); fieldType.setIndexed(true); fieldType.setStoreTermVectors(true); fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); fieldType.freeze(); FieldType docLengthFieldType = new FieldType(IntField.TYPE_STORED); docLengthFieldType.setIndexed(false); docLengthFieldType.freeze(); writer = new IndexWriter(FSDirectory.open(indexPath), config); int count = 0; Document indexDocument; IndexableDocument currentDocument; while (docIterator.hasNext()) { currentDocument = docIterator.next(); if (currentDocument.getText().length() > 0) { indexDocument = toLuceneDocument(analyzer, currentDocument.getText(), fieldType); addDocumentLength(indexDocument, docLengthFieldName, docLengthFieldType, currentDocument.getNumberOfTokens()); writer.addDocument(indexDocument); ++count; if (count >= commitInterval) { writer.commit(); System.gc(); count = 0; } } } LOGGER.info("Finished index creation."); } catch (IOException e) { LOGGER.error("Error while creating Index. Aborting.", e); return false; } finally { if (writer != null) { try { writer.close(); } catch (Exception e) { } } } return true; }
From source file:org.aksw.palmetto.corpus.lucene.creation.SimpleLuceneIndexCreator.java
License:Open Source License
/**
 * Creates the index.
 *
 * @param indexPath
 *            The path to the directory in which the Lucene index will be created
 * @param docIterator
 *            Iterator that iterates over the document texts.
 * @return true if the creation was successful, else false.
 */
public boolean createIndex(File indexPath, Iterator<String> docIterator) {
    LOGGER.info("Starting index creation...");
    IndexWriter writer = null;
    indexPath.mkdirs();
    Analyzer analyzer = new SimpleAnalyzer(true);
    try {
        IndexWriterConfig config = new IndexWriterConfig(version, analyzer);
        config.setOpenMode(OpenMode.CREATE);
        // Text field: indexed with doc frequencies only; text is not stored.
        FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
        fieldType.setIndexed(true);
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
        fieldType.freeze();
        writer = new IndexWriter(FSDirectory.open(indexPath), config);
        String text;
        int count = 0;
        while (docIterator.hasNext()) {
            text = docIterator.next();
            if (text.length() > 0) {
                writer.addDocument(toLuceneDocument(analyzer, text, fieldType));
                ++count;
                // Commit periodically so a crash does not lose all progress.
                if (count >= commitInterval) {
                    writer.commit();
                    System.gc();
                    count = 0;
                }
            } else {
                LOGGER.warn("Got a document without content.");
            }
        }
        LOGGER.info("Finished index creation.");
    } catch (IOException e) {
        LOGGER.error("Error while creating Index. Aborting.", e);
        return false;
    } finally {
        if (writer != null) {
            try {
                writer.close();
            } catch (Exception e) {
                // FIX: previously swallowed silently; a failed close can hide
                // index corruption, so at least record it.
                LOGGER.warn("Error while closing the index writer.", e);
            }
        }
    }
    return true;
}
From source file:org.apache.camel.component.lucene.LuceneIndexer.java
License:Apache License
private static FieldType createFieldType(boolean tokenized) { FieldType answer = new FieldType(); answer.setIndexed(true);/* ww w. ja va 2s . co m*/ answer.setStored(true); answer.setTokenized(tokenized); // freeze the answer so that it becomes immutable answer.freeze(); return answer; }
From source file:org.apache.mahout.clustering.TestClusterDumper.java
License:Apache License
private void getSampleData(String[] docs2) throws IOException { sampleData = Lists.newArrayList();/*w ww .jav a 2 s.c om*/ RAMDirectory directory = new RAMDirectory(); IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_46, new StandardAnalyzer(Version.LUCENE_46))); try { for (int i = 0; i < docs2.length; i++) { Document doc = new Document(); Field id = new StringField("id", "doc_" + i, Field.Store.YES); doc.add(id); // Store both position and offset information FieldType fieldType = new FieldType(); fieldType.setStored(false); fieldType.setIndexed(true); fieldType.setTokenized(true); fieldType.setStoreTermVectors(true); fieldType.setStoreTermVectorPositions(true); fieldType.setStoreTermVectorOffsets(true); fieldType.freeze(); Field text = new Field("content", docs2[i], fieldType); doc.add(text); writer.addDocument(doc); } } finally { Closeables.close(writer, false); } IndexReader reader = DirectoryReader.open(directory); Weight weight = new TFIDF(); TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100); int numTerms = 0; for (Iterator<TermEntry> it = termInfo.getAllEntries(); it.hasNext();) { it.next(); numTerms++; } termDictionary = new String[numTerms]; int i = 0; for (Iterator<TermEntry> it = termInfo.getAllEntries(); it.hasNext();) { String term = it.next().getTerm(); termDictionary[i] = term; System.out.println(i + " " + term); i++; } Iterable<Vector> iterable = new LuceneIterable(reader, "id", "content", termInfo, weight); i = 0; for (Vector vector : iterable) { assertNotNull(vector); NamedVector namedVector; if (vector instanceof NamedVector) { // rename it for testing purposes namedVector = new NamedVector(((NamedVector) vector).getDelegate(), "P(" + i + ')'); } else { namedVector = new NamedVector(vector, "P(" + i + ')'); } System.out.println(AbstractCluster.formatVector(namedVector, termDictionary)); sampleData.add(new VectorWritable(namedVector)); i++; } }
From source file:org.apache.mahout.utils.vectors.lucene.CachedTermInfoTest.java
License:Apache License
@Before public void before() throws IOException { directory = new RAMDirectory(); FieldType fieldType = new FieldType(); fieldType.setStored(false);//from w ww . j a v a 2s.c o m fieldType.setIndexed(true); fieldType.setTokenized(true); fieldType.setStoreTermVectors(false); fieldType.setStoreTermVectorPositions(false); fieldType.setStoreTermVectorOffsets(false); fieldType.freeze(); directory = createTestIndex(fieldType, directory, 0); }