Usage examples for org.apache.lucene.index.IndexWriter.addDocument
public long addDocument(Iterable<? extends IndexableField> doc) throws IOException
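Before the collected examples, here is a minimal sketch of the basic call pattern, assuming Lucene 4.6-era APIs to match most of the snippets below (imports from org.apache.lucene.* are elided, as in the examples that follow; the long return value in the signature above, an operation sequence number, only exists in later releases, where addDocument is otherwise used the same way):

    Directory dir = new RAMDirectory();
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46,
            new StandardAnalyzer(Version.LUCENE_46));
    IndexWriter writer = new IndexWriter(dir, config);
    Document doc = new Document();
    // TextField is analyzed; Field.Store.YES keeps the original value retrievable
    doc.add(new TextField("title", "Hello Lucene", Field.Store.YES));
    writer.addDocument(doc); // buffers the document; it becomes searchable after commit()/close()
    writer.close();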
From source file: com.mathworks.xzheng.searching.PhraseQueryTest.java
License: Apache License
protected void setUp() throws IOException {
    dir = new RAMDirectory();
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46,
            new WhitespaceAnalyzer(Version.LUCENE_46));
    IndexWriter writer = new IndexWriter(dir, config);
    Document doc = new Document();
    doc.add(new Field("field",                                      // 1
            "the quick brown fox jumped over the lazy dog",         // 1
            Field.Store.YES,                                        // 1
            Field.Index.ANALYZED));                                 // 1
    writer.addDocument(doc);
    writer.close();
    searcher = new IndexSearcher(DirectoryReader.open(dir));
}
From source file: com.mathworks.xzheng.searching.ScoreTest.java
License: Apache License
private void indexSingleFieldDocs(Field[] fields) throws Exception {
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46,
            new WhitespaceAnalyzer(Version.LUCENE_46));
    IndexWriter writer = new IndexWriter(directory, config);
    for (Field f : fields) {
        Document doc = new Document();
        doc.add(f);
        writer.addDocument(doc);
    }
    writer.forceMerge(1);
    writer.close();
}
From source file: com.mathworks.xzheng.tools.BerkeleyDbIndexer.java
License: Apache License
public static void main(String[] args) throws IOException, DatabaseException { if (args.length != 1) { System.err.println("Usage: BerkeleyDbIndexer <index dir>"); System.exit(-1);//ww w. j a v a2 s .c o m } File indexFile = new File(args[0]); if (indexFile.exists()) { File[] files = indexFile.listFiles(); for (int i = 0; i < files.length; i++) if (files[i].getName().startsWith("__")) files[i].delete(); indexFile.delete(); } indexFile.mkdir(); EnvironmentConfig envConfig = new EnvironmentConfig(); DatabaseConfig dbConfig = new DatabaseConfig(); envConfig.setTransactional(true); envConfig.setInitializeCache(true); envConfig.setInitializeLocking(true); envConfig.setInitializeLogging(true); envConfig.setAllowCreate(true); envConfig.setThreaded(true); dbConfig.setAllowCreate(true); dbConfig.setType(DatabaseType.BTREE); Environment env = new Environment(indexFile, envConfig); Transaction txn = env.beginTransaction(null, null); Database index = env.openDatabase(txn, "__index__", null, dbConfig); Database blocks = env.openDatabase(txn, "__blocks__", null, dbConfig); txn.commit(); txn = env.beginTransaction(null, null); DbDirectory directory = new DbDirectory(txn, index, blocks); IndexWriter writer = new IndexWriter(directory, new StandardAnalyzer(Version.LUCENE_46), true, IndexWriter.MaxFieldLength.UNLIMITED); Document doc = new Document(); doc.add(new Field("contents", "The quick brown fox...", Field.Store.YES, Field.Index.ANALYZED)); writer.addDocument(doc); writer.optimize(); writer.close(); directory.close(); txn.commit(); index.close(); blocks.close(); env.close(); System.out.println("Indexing Complete"); }
From source file: com.mathworks.xzheng.tools.BerkeleyDbJEIndexer.java
License: Apache License
public static void main(String[] args) throws IOException, DatabaseException { if (args.length != 1) { System.err.println("Usage: BerkeleyDbIndexer <index dir>"); System.exit(-1);//from w w w .j a va 2 s . c o m } File indexFile = new File(args[0]); if (indexFile.exists()) { // A File[] files = indexFile.listFiles(); // A for (int i = 0; i < files.length; i++) // A if (files[i].getName().startsWith("__")) // A files[i].delete(); // A indexFile.delete(); // A } indexFile.mkdir(); EnvironmentConfig envConfig = new EnvironmentConfig(); // B DatabaseConfig dbConfig = new DatabaseConfig(); // B envConfig.setTransactional(true); // B envConfig.setAllowCreate(true); // B dbConfig.setTransactional(true); // B dbConfig.setAllowCreate(true); // B Environment env = new Environment(indexFile, envConfig); // C Transaction txn = env.beginTransaction(null, null); // C Database index = env.openDatabase(txn, "__index__", dbConfig); // C Database blocks = env.openDatabase(txn, "__blocks__", dbConfig); // C txn.commit(); // C txn = env.beginTransaction(null, null); // C JEDirectory directory = new JEDirectory(txn, index, blocks); // D IndexWriter writer = new IndexWriter(directory, new StandardAnalyzer(Version.LUCENE_46), true, IndexWriter.MaxFieldLength.UNLIMITED); Document doc = new Document(); doc.add(new Field("contents", "The quick brown fox...", Field.Store.YES, Field.Index.ANALYZED)); writer.addDocument(doc); writer.optimize(); writer.close(); directory.close(); txn.commit(); index.close(); blocks.close(); env.close(); System.out.println("Indexing Complete"); }
From source file: com.mathworks.xzheng.tools.ChainedFilterTest.java
License: Apache License
public void setUp() throws Exception {
    directory = new RAMDirectory();
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46,
            new WhitespaceAnalyzer(Version.LUCENE_46));
    IndexWriter writer = new IndexWriter(directory, config);
    Calendar cal = Calendar.getInstance();
    cal.set(2009, 1, 1, 0, 0);                                 // A
    for (int i = 0; i < MAX; i++) {
        Document doc = new Document();
        doc.add(new Field("key", "" + (i + 1), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("owner", (i < MAX / 2) ? "bob" : "sue", Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("date", DateTools.timeToString(cal.getTimeInMillis(), DateTools.Resolution.DAY),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        writer.addDocument(doc);
        cal.add(Calendar.DATE, 1);
    }
    writer.close();
    searcher = new IndexSearcher(DirectoryReader.open(directory));
    BooleanQuery bq = new BooleanQuery();                      // B
    bq.add(new TermQuery(new Term("owner", "bob")),            // B
            BooleanClause.Occur.SHOULD);                       // B
    bq.add(new TermQuery(new Term("owner", "sue")),            // B
            BooleanClause.Occur.SHOULD);                       // B
    query = bq;
    cal.set(2099, 1, 1, 0, 0);
    dateFilter = TermRangeFilter.Less("date",                  // C
            new BytesRef(DateTools.timeToString(               // C
                    cal.getTimeInMillis(),                     // C
                    DateTools.Resolution.DAY)));               // C
    bobFilter = new CachingWrapperFilter(                      // D
            new QueryWrapperFilter(                            // D
                    new TermQuery(new Term("owner", "bob")))); // D
    sueFilter = new CachingWrapperFilter(                      // E
            new QueryWrapperFilter(                            // E
                    new TermQuery(new Term("owner", "sue")))); // E
}
From source file: com.mathworks.xzheng.tools.FastVectorHighlighterSample.java
License: Apache License
static void makeIndex() throws IOException {
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46, analyzer);
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IndexWriter writer = new IndexWriter(dir, config);
    for (String d : DOCS) {
        Document doc = new Document();
        doc.add(new Field(F, d, Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
        writer.addDocument(doc);
    }
    writer.close();
}
From source file: com.meizu.nlp.classification.utils.DatasetSplitter.java
License: Apache License
/**
 * Split a given index into 3 indexes for training, test and cross validation tasks respectively.
 *
 * @param originalIndex        an {@link org.apache.lucene.index.LeafReader} on the source index
 * @param trainingIndex        a {@link Directory} used to write the training index
 * @param testIndex            a {@link Directory} used to write the test index
 * @param crossValidationIndex a {@link Directory} used to write the cross validation index
 * @param analyzer             {@link Analyzer} used to create the new docs
 * @param fieldNames           names of fields that need to be put in the new indexes or <code>null</code> if all should be used
 * @throws IOException if any writing operation fails on any of the indexes
 */
public void split(LeafReader originalIndex, Directory trainingIndex, Directory testIndex,
        Directory crossValidationIndex, Analyzer analyzer, String... fieldNames) throws IOException {
    // create IWs for train / test / cv IDXs
    IndexWriter testWriter = new IndexWriter(testIndex, new IndexWriterConfig(analyzer));
    IndexWriter cvWriter = new IndexWriter(crossValidationIndex, new IndexWriterConfig(analyzer));
    IndexWriter trainingWriter = new IndexWriter(trainingIndex, new IndexWriterConfig(analyzer));
    try {
        int size = originalIndex.maxDoc();
        IndexSearcher indexSearcher = new IndexSearcher(originalIndex);
        TopDocs topDocs = indexSearcher.search(new MatchAllDocsQuery(), Integer.MAX_VALUE);
        // set the type to be indexed, stored, with term vectors
        FieldType ft = new FieldType(TextField.TYPE_STORED);
        ft.setStoreTermVectors(true);
        ft.setStoreTermVectorOffsets(true);
        ft.setStoreTermVectorPositions(true);
        int b = 0;
        // iterate over existing documents
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            // create a new document for indexing
            Document doc = new Document();
            if (fieldNames != null && fieldNames.length > 0) {
                for (String fieldName : fieldNames) {
                    doc.add(new Field(fieldName,
                            originalIndex.document(scoreDoc.doc).getField(fieldName).stringValue(), ft));
                }
            } else {
                for (IndexableField storableField : originalIndex.document(scoreDoc.doc).getFields()) {
                    if (storableField.readerValue() != null) {
                        doc.add(new Field(storableField.name(), storableField.readerValue(), ft));
                    } else if (storableField.binaryValue() != null) {
                        doc.add(new Field(storableField.name(), storableField.binaryValue(), ft));
                    } else if (storableField.stringValue() != null) {
                        doc.add(new Field(storableField.name(), storableField.stringValue(), ft));
                    } else if (storableField.numericValue() != null) {
                        doc.add(new Field(storableField.name(), storableField.numericValue().toString(), ft));
                    }
                }
            }
            // add it to one of the IDXs
            if (b % 2 == 0 && testWriter.maxDoc() < size * testRatio) {
                testWriter.addDocument(doc);
            } else if (cvWriter.maxDoc() < size * crossValidationRatio) {
                cvWriter.addDocument(doc);
            } else {
                trainingWriter.addDocument(doc);
            }
            b++;
        }
    } catch (Exception e) {
        throw new IOException(e);
    } finally {
        testWriter.commit();
        cvWriter.commit();
        trainingWriter.commit();
        // close IWs
        testWriter.close();
        cvWriter.close();
        trainingWriter.close();
    }
}
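For orientation, a hypothetical invocation of split is sketched below. It assumes a two-ratio constructor (as in Lucene's own org.apache.lucene.classification.utils.DatasetSplitter, which this class mirrors) and Lucene 5.x-era readers; sourceDir, trainingDir, testDir, cvDir, and the field names are illustrative placeholders:

    // Hypothetical usage: ~10% test, ~10% cross-validation, the rest training.
    LeafReader reader = SlowCompositeReaderWrapper.wrap(DirectoryReader.open(sourceDir));
    DatasetSplitter splitter = new DatasetSplitter(0.1, 0.1); // testRatio, crossValidationRatio
    splitter.split(reader, trainingDir, testDir, cvDir, new StandardAnalyzer(), "body", "category");
    reader.close();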
From source file: com.meltmedia.cadmium.search.SearchContentPreprocessor.java
License: Apache License
void writeIndex(final IndexWriter indexWriter, File contentDir) throws Exception {
    new ContentScanTemplate(HTML_FILE_FILTER) {
        private Jerry.JerryParser jerryParser = null;

        @Override
        public void handleFile(File file) throws Exception {
            try {
                if (jerryParser == null) {
                    jerryParser = Jerry.jerry().enableHtmlMode();
                    jerryParser.getDOMBuilder().setCaseSensitive(false);
                    jerryParser.getDOMBuilder().setParseSpecialTagsAsCdata(true);
                    jerryParser.getDOMBuilder().setSelfCloseVoidTags(false);
                    jerryParser.getDOMBuilder().setConditionalCommentExpression(null);
                    jerryParser.getDOMBuilder().setEnableConditionalComments(false);
                    jerryParser.getDOMBuilder().setImpliedEndTags(false);
                    jerryParser.getDOMBuilder().setIgnoreComments(true);
                }
                String htmlContent = FileUtils.readFileToString(file, "UTF-8");
                Jerry jerry = jerryParser.parse(htmlContent);
                // if we should not index this file, move on.
                if (!shouldIndex(jerry))
                    return;
                String title = jerry.$("html > head > title").text();
                Jerry removals = jerry.$("title,head,script,[cadmium=\"no-index\"]");
                if (removals.size() > 0) {
                    log.debug("Removing {} element[s]", removals.length());
                    removals.remove();
                } else {
                    log.debug("No elements to remove");
                }
                String textContent = jerry.$("body").text();
                Document doc = new Document();
                doc.add(new TextField("title", title, Field.Store.YES));
                doc.add(new TextField("content", textContent, Field.Store.YES));
                doc.add(new TextField("path", file.getPath().replaceFirst(dataDir.getPath(), ""), Field.Store.YES));
                indexWriter.addDocument(doc);
            } catch (Throwable t) {
                log.warn("Failed to index page [" + file + "]", t);
            }
        }
    }.scan(contentDir);
}
From source file: com.miliworks.virgo.test.LuceneIndexAndSearchDemo.java
License: Apache License
/**
 * Demonstrates indexing and searching with the IK Analyzer.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    // field name and sample text for the Lucene Document
    String fieldName = "text";
    String text = "IK Analyzer???????";
    // instantiate the IK Analyzer in smart mode
    Analyzer analyzer = new IKAnalyzer(true);

    Directory directory = null;
    IndexWriter iwriter = null;
    IndexReader ireader = null;
    IndexSearcher isearcher = null;
    try {
        // build an in-memory index
        directory = new RAMDirectory();

        // configure the IndexWriter
        IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_40, analyzer);
        iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
        iwriter = new IndexWriter(directory, iwConfig);
        // write a document
        Document doc = new Document();
        doc.add(new StringField("ID", "10000", Field.Store.YES));
        doc.add(new TextField(fieldName, text, Field.Store.YES));
        iwriter.addDocument(doc);
        iwriter.close();

        // searching **********************************
        ireader = DirectoryReader.open(directory);
        isearcher = new IndexSearcher(ireader);

        String keyword = "?";
        // build a Query with QueryParser
        QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, analyzer);
        qp.setDefaultOperator(QueryParser.AND_OPERATOR);
        Query query = qp.parse(keyword);
        System.out.println("Query = " + query);

        // fetch the top 5 hits
        TopDocs topDocs = isearcher.search(query, 5);
        System.out.println("hits: " + topDocs.totalHits);
        // print the results; iterate over scoreDocs (at most 5), not totalHits
        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        for (int i = 0; i < scoreDocs.length; i++) {
            Document targetDoc = isearcher.doc(scoreDocs[i].doc);
            System.out.println(targetDoc.toString());
        }
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (LockObtainFailedException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (ParseException e) {
        e.printStackTrace();
    } finally {
        if (ireader != null) {
            try {
                ireader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (directory != null) {
            try {
                directory.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
From source file: com.mmiagency.knime.nodes.keyworddensity.util.KeywordDensityHelper.java
License: Open Source License
public void execute() throws IOException {
    org.jsoup.nodes.Document jdoc = null;
    // pull content using Jsoup
    if (m_content != null && !m_content.trim().isEmpty()) {
        jdoc = Jsoup.parse(m_content);
    } else {
        Connection conn = Jsoup.connect(m_url);
        conn.validateTLSCertificates(false);
        conn.followRedirects(true);
        conn.userAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:40.0) Gecko/20100101 Firefox/40.0");
        conn.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        conn.header("Accept-Language", "en-US,en;q=0.5");
        conn.header("Accept-Encoding", "gzip, deflate");
        conn.execute();
        jdoc = conn.get();
    }
    StringWriter text = new StringWriter();
    if (m_includeMetaKeywords) {
        text.write(jdoc.select("meta[name=keywords]").attr("content"));
        text.write(" ");
    }
    if (m_includeMetaDescription) {
        text.write(jdoc.select("meta[name=description]").attr("content"));
        text.write(" ");
    }
    if (m_includePageTitle) {
        text.write(jdoc.select("title").text());
        text.write(" ");
    }
    text.write(jdoc.select("body").text());
    // analyze content with Lucene
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
    Directory directory = new RAMDirectory();
    IndexWriter indexWriter = new IndexWriter(directory, analyzer, MaxFieldLength.LIMITED);
    Document doc = new Document();
    Field textField = new Field("content", text.toString(), Field.Store.YES, Field.Index.ANALYZED,
            TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(textField);
    indexWriter.addDocument(doc);
    indexWriter.commit();
    indexWriter.close();
    IndexReader indexReader = IndexReader.open(directory, true);
    TermFreqVector termFreqVector = null;
    for (int i = 0; i < indexReader.maxDoc(); i++) {
        termFreqVector = indexReader.getTermFreqVector(i, "content");
        String[] terms = termFreqVector.getTerms();
        int[] freqs = termFreqVector.getTermFrequencies();
        for (int n = 0; n < termFreqVector.size(); n++) {
            if (m_excludeList.contains(terms[n])) {
                continue;
            }
            add(terms[n], freqs[n]);
        }
    }
    indexReader.close();
    directory.close();
    // sort map by value
    sortMap();
}