List of usage examples for org.apache.lucene.index.IndexableField#readerValue()
public Reader readerValue();
From source file:com.meizu.nlp.classification.utils.DatasetSplitter.java
License:Apache License
/** * Split a given index into 3 indexes for training, test and cross validation tasks respectively * * @param originalIndex an {@link org.apache.lucene.index.LeafReader} on the source index * @param trainingIndex a {@link Directory} used to write the training index * @param testIndex a {@link Directory} used to write the test index * @param crossValidationIndex a {@link Directory} used to write the cross validation index * @param analyzer {@link Analyzer} used to create the new docs * @param fieldNames names of fields that need to be put in the new indexes or <code>null</code> if all should be used * @throws IOException if any writing operation fails on any of the indexes *//*from w w w .j a v a2 s . c om*/ public void split(LeafReader originalIndex, Directory trainingIndex, Directory testIndex, Directory crossValidationIndex, Analyzer analyzer, String... fieldNames) throws IOException { // create IWs for train / test / cv IDXs IndexWriter testWriter = new IndexWriter(testIndex, new IndexWriterConfig(analyzer)); IndexWriter cvWriter = new IndexWriter(crossValidationIndex, new IndexWriterConfig(analyzer)); IndexWriter trainingWriter = new IndexWriter(trainingIndex, new IndexWriterConfig(analyzer)); try { int size = originalIndex.maxDoc(); IndexSearcher indexSearcher = new IndexSearcher(originalIndex); TopDocs topDocs = indexSearcher.search(new MatchAllDocsQuery(), Integer.MAX_VALUE); // set the type to be indexed, stored, with term vectors FieldType ft = new FieldType(TextField.TYPE_STORED); ft.setStoreTermVectors(true); ft.setStoreTermVectorOffsets(true); ft.setStoreTermVectorPositions(true); int b = 0; // iterate over existing documents for (ScoreDoc scoreDoc : topDocs.scoreDocs) { // create a new document for indexing Document doc = new Document(); if (fieldNames != null && fieldNames.length > 0) { for (String fieldName : fieldNames) { doc.add(new Field(fieldName, originalIndex.document(scoreDoc.doc).getField(fieldName).stringValue(), ft)); } } else { for 
(IndexableField storableField : originalIndex.document(scoreDoc.doc).getFields()) { if (storableField.readerValue() != null) { doc.add(new Field(storableField.name(), storableField.readerValue(), ft)); } else if (storableField.binaryValue() != null) { doc.add(new Field(storableField.name(), storableField.binaryValue(), ft)); } else if (storableField.stringValue() != null) { doc.add(new Field(storableField.name(), storableField.stringValue(), ft)); } else if (storableField.numericValue() != null) { doc.add(new Field(storableField.name(), storableField.numericValue().toString(), ft)); } } } // add it to one of the IDXs if (b % 2 == 0 && testWriter.maxDoc() < size * testRatio) { testWriter.addDocument(doc); } else if (cvWriter.maxDoc() < size * crossValidationRatio) { cvWriter.addDocument(doc); } else { trainingWriter.addDocument(doc); } b++; } } catch (Exception e) { throw new IOException(e); } finally { testWriter.commit(); cvWriter.commit(); trainingWriter.commit(); // close IWs testWriter.close(); cvWriter.close(); trainingWriter.close(); } }
From source file:org.opencms.search.CmsLuceneDocument.java
License:Open Source License
/** * @see org.opencms.search.I_CmsSearchDocument#getContentBlob() */// www. ja v a2 s . co m public byte[] getContentBlob() { IndexableField fieldContentBlob = m_doc.getField(CmsSearchField.FIELD_CONTENT_BLOB); if (fieldContentBlob != null) { try { if (fieldContentBlob.readerValue() != null) { return IOUtils.toByteArray(fieldContentBlob.readerValue()); } } catch (IOException e) { // TODO: } } return null; }
From source file:org.opengrok.indexer.index.IndexDatabase.java
License:Open Source License
/** * Do a best effort to clean up all resources allocated when populating * a Lucene document. On normal execution, these resources should be * closed automatically by the index writer once it's done with them, but * we may not get that far if something fails. * * @param doc the document whose resources to clean up *//*from w w w . ja v a 2s . co m*/ private static void cleanupResources(Document doc) { for (IndexableField f : doc) { // If the field takes input from a reader, close the reader. IOUtils.close(f.readerValue()); // If the field takes input from a token stream, close the // token stream. if (f instanceof Field) { IOUtils.close(((Field) f).tokenStreamValue()); } } }
From source file:org.opensolaris.opengrok.index.IndexDatabase.java
License:Open Source License
/** * Do a best effort to clean up all resources allocated when populating * a Lucene document. On normal execution, these resources should be * closed automatically by the index writer once it's done with them, but * we may not get that far if something fails. * * @param doc the document whose resources to clean up *//*w w w. j a va 2s . co m*/ private void cleanupResources(Document doc) { for (IndexableField f : doc) { // If the field takes input from a reader, close the reader. IOUtils.close(f.readerValue()); // If the field takes input from a token stream, close the // token stream. if (f instanceof Field) { IOUtils.close(((Field) f).tokenStreamValue()); } } }