Usage examples for org.apache.lucene.index.IndexWriter#addDocument
public long addDocument(Iterable<? extends IndexableField> doc) throws IOException
From source file:com.senseidb.search.node.inmemory.InMemorySenseiService.java
License:Apache License
/**
 * Rebuilds the in-memory index from the given JSON documents: the index is
 * wiped first, each non-null document is converted and added, and the result
 * is committed in one batch.
 *
 * @param directory the index directory (unused here; kept for the caller's signature)
 * @param writer    the writer backing the in-memory index
 * @param documents JSON documents to index; null entries are skipped
 * @throws RuntimeException wrapping any failure during deletion, indexing, or commit
 */
private void addDocuments(Directory directory, IndexWriter writer, List<JSONObject> documents) {
    try {
        // Start from an empty index so the batch fully replaces prior contents.
        writer.deleteAll();
        for (JSONObject document : documents) {
            if (document == null) {
                continue; // tolerate missing entries instead of failing the whole batch
            }
            writer.addDocument(buildDoc(document));
            pluggableSearchEngineManager.update(document, "");
        }
        writer.commit();
    } catch (Exception e) {
        // Surface any indexing failure to the caller as an unchecked exception.
        throw new RuntimeException(e);
    }
}
From source file:com.serendio.lingo3g.CreateLuceneIndex.java
License:Open Source License
public static void main(String[] args) throws Exception { if (args.length != 1) { System.out.println("Args: index-dir"); System.exit(-1);// ww w .j av a 2 s. c o m } File indexDir = new File(args[0]); if (indexDir.exists()) { System.out.println("Index directory already exists: " + indexDir.getAbsolutePath()); System.exit(-2); } Analyzer analyzer = new StandardAnalyzer(); IndexWriterConfig config = new IndexWriterConfig(analyzer); IndexWriter writer = new IndexWriter(FSDirectory.open(indexDir.toPath()), config); for (Document d : SampleDocumentData.DOCUMENTS_DATA_MINING) { final org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document(); /* * We will create Lucene documents with searchable "fullContent" field and "title", * "url" and "snippet" fields for clustering. */ doc.add(new TextField("fullContent", d.getSummary(), Store.NO)); doc.add(new TextField("title", d.getTitle(), Store.YES)); doc.add(new TextField("snippet", d.getSummary(), Store.YES)); doc.add(new StringField("url", d.getContentUrl(), Store.YES)); writer.addDocument(doc); } writer.close(); }
From source file:com.sg.business.vault.index.demo.IndexFiles.java
License:Apache License
/** * Indexes the given file using the given writer, or if a directory is given, * recurses over files and directories found under the given directory. * // www. j av a 2 s.c o m * NOTE: This method indexes one document per input file. This is slow. For good * throughput, put multiple documents into your input file(s). An example of this is * in the benchmark module, which can create "line doc" files, one document per line, * using the * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html" * >WriteLineDocTask</a>. * * @param writer Writer to the index where the given file/dir info will be stored * @param file The file to index, or the directory to recurse into to find files to index * @throws IOException If there is a low-level I/O error */ static void indexDocs(IndexWriter writer, File file) throws IOException { // do not try to index files that cannot be read if (file.canRead()) { if (file.isDirectory()) { String[] files = file.list(); // an IO error could occur if (files != null) { for (int i = 0; i < files.length; i++) { indexDocs(writer, new File(file, files[i])); } } } else { FileInputStream fis; try { fis = new FileInputStream(file); } catch (FileNotFoundException fnfe) { // at least on windows, some temporary files raise this exception with an "access denied" message // checking if the file can be read doesn't help return; } try { // make a new, empty document Document doc = new Document(); // Add the path of the file as a field named "path". Use a // field that is indexed (i.e. searchable), but don't tokenize // the field into separate words and don't index term frequency // or positional information: Field pathField = new StringField("path", file.getPath(), Field.Store.YES); //$NON-NLS-1$ doc.add(pathField); // Add the last modified date of the file a field named "modified". // Use a LongField that is indexed (i.e. efficiently filterable with // NumericRangeFilter). 
This indexes to milli-second resolution, which // is often too fine. You could instead create a number based on // year/month/day/hour/minutes/seconds, down the resolution you require. // For example the long value 2011021714 would mean // February 17, 2011, 2-3 PM. // doc.add(new LongField("modified", file.lastModified(), Field.Store.NO)); // Add the contents of the file to a field named "contents". Specify a Reader, // so that the text of the file is tokenized and indexed, but not stored. // Note that FileReader expects the file to be in UTF-8 encoding. // If that's not the case searching for special characters will fail. // doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8")))); try { doc.add(new TextField("contents", FileUtil.getContent(file.getName(), fis), //$NON-NLS-1$ Field.Store.NO)); } catch (Exception e) { e.printStackTrace(); } if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): System.out.println("adding " + file); //$NON-NLS-1$ writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: System.out.println("updating " + file); //$NON-NLS-1$ writer.updateDocument(new Term("path", file.getPath()), doc); //$NON-NLS-1$ } } finally { fis.close(); } } } }
From source file:com.shaie.annots.AnnotationSearchExample.java
License:Apache License
public static void main(String[] args) throws Exception { Directory dir = new RAMDirectory(); IndexWriterConfig conf = new IndexWriterConfig(new WhitespaceAnalyzer()); IndexWriter writer = new IndexWriter(dir, conf); // we need to add the annotation as a TokenStream field, therefore cannot use an Analyzer passed in the // IndexWriterConfig. Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader("quick brown fox ate the blue red chicken")); TeeSinkTokenFilter textStream = new TeeSinkTokenFilter(tokenizer); TokenStream colorAnnotationStream = new AnnotatingTokenFilter( textStream.newSinkTokenStream(new ColorsSinkFilter()), COLOR_ANNOT_TERM); Document doc = new Document(); doc.add(new TextField("text", textStream)); doc.add(new TextField("annot", colorAnnotationStream)); writer.addDocument(doc); writer.close();//from ww w . j av a 2 s .c om DirectoryReader reader = DirectoryReader.open(dir); LeafReader ar = reader.leaves().get(0).reader(); // we only have one segment printFieldTerms(ar, "text"); System.out.println(); final ByteArrayDataInput in = new ByteArrayDataInput(); PostingsEnum dape = ar.postings(new Term("annot", COLOR_ANNOT_TERM)); int docID = dape.nextDoc(); int freq = dape.freq(); System.out.println("Color annotation spans: doc=" + docID + ", freq=" + freq); for (int i = 0; i < freq; i++) { dape.nextPosition(); BytesRef payload = dape.getPayload(); in.reset(payload.bytes, payload.offset, payload.length); System.out.println(" start=" + in.readVInt() + ", length=" + in.readVInt()); } IndexSearcher searcher = new IndexSearcher(reader); System.out.println("\nsearching for 'red WITHIN color':"); Query q = new SpanWithinQuery(new SpanAnnotationTermQuery(new Term("annot", COLOR_ANNOT_TERM)), new SpanInclusivePositionTermQuery(new Term("text", "red"))); TopDocs td = searcher.search(q, 10); System.out.println(" num results: " + td.scoreDocs.length); System.out.println("\nsearching for 'ate WITHIN color':"); q = new SpanWithinQuery(new 
SpanAnnotationTermQuery(new Term("annot", COLOR_ANNOT_TERM)), new SpanInclusivePositionTermQuery(new Term("text", "ate"))); td = searcher.search(q, 10); System.out.println(" num results: " + td.scoreDocs.length); reader.close(); dir.close(); }
From source file:com.shaie.annots.example.AnnotatorAnyExample.java
License:Apache License
/**
 * Indexes one document whose tokens are teed into the text field and two
 * annotation fields (colors and animals), each wrapped so any annotated
 * token matches (AnyAnnotationTokenFilter).
 *
 * @param writer destination index writer
 * @param text   raw document text; also stored verbatim for retrieval
 * @throws IOException on any indexing error
 */
@SuppressWarnings("resource")
private static void addDocument(IndexWriter writer, String text) throws IOException {
    final Tokenizer source = new WhitespaceTokenizer();
    source.setReader(new StringReader(text));
    // Tee the single token stream so the same tokens feed all three indexed fields.
    final TeeSinkTokenFilter tee = new TeeSinkTokenFilter(source);
    final TokenStream colors = new AnyAnnotationTokenFilter(
            new AnnotatorTokenFilter(tee.newSinkTokenStream(), ColorAnnotator.withDefaultColors()));
    final TokenStream animals = new AnyAnnotationTokenFilter(
            new AnnotatorTokenFilter(tee.newSinkTokenStream(), AnimalAnnotator.withDefaultAnimals()));

    final Document document = new Document();
    document.add(new StoredField(TEXT_FIELD, text)); // raw text, stored only
    document.add(new TextField(TEXT_FIELD, tee));    // indexed tokens
    document.add(new TextField(COLOR_FIELD, colors));
    document.add(new TextField(ANIMAL_FIELD, animals));
    writer.addDocument(document);
}
From source file:com.shaie.annots.example.AnnotatorTeeSinkFilterExample.java
License:Apache License
/**
 * Indexes one document whose tokens are teed into the text field and two
 * annotator-filtered fields (colors and animals).
 *
 * @param writer destination index writer
 * @param text   raw document text; also stored verbatim for retrieval
 * @throws IOException on any indexing error
 */
@SuppressWarnings("resource")
private static void addDocument(IndexWriter writer, String text) throws IOException {
    final Tokenizer source = new WhitespaceTokenizer();
    source.setReader(new StringReader(text));
    // Tee the single token stream so the same tokens feed all three indexed fields.
    final TeeSinkTokenFilter tee = new TeeSinkTokenFilter(source);
    final TokenStream colors =
            new AnnotatorTokenFilter(tee.newSinkTokenStream(), ColorAnnotator.withDefaultColors());
    final TokenStream animals =
            new AnnotatorTokenFilter(tee.newSinkTokenStream(), AnimalAnnotator.withDefaultAnimals());

    final Document document = new Document();
    document.add(new StoredField(TEXT_FIELD, text)); // raw text, stored only
    document.add(new TextField(TEXT_FIELD, tee));    // indexed tokens
    document.add(new TextField(COLOR_FIELD, colors));
    document.add(new TextField(ANIMAL_FIELD, animals));
    writer.addDocument(document);
}
From source file:com.shaie.annots.example.AnnotatorTokenFilterExample.java
License:Apache License
/**
 * Indexes one document, feeding the same raw text into three fields: the
 * searchable stored text field and the two (unstored) annotation fields.
 *
 * @param writer destination index writer
 * @param text   raw document text
 * @throws IOException on any indexing error
 */
private static void addDocument(IndexWriter writer, String text) throws IOException {
    final Document document = new Document();
    document.add(new TextField(TEXT_FIELD, text, Store.YES));
    document.add(new TextField(COLOR_FIELD, text, Store.NO));
    document.add(new TextField(ANIMAL_FIELD, text, Store.NO));
    writer.addDocument(document);
}
From source file:com.shaie.annots.example.PreAnnotatedTokenFilterExample.java
License:Apache License
@SuppressWarnings("resource") private static void addDocument(IndexWriter writer, String text, int... colorAnnotations) throws IOException { final Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader(text)); final TeeSinkTokenFilter textStream = new TeeSinkTokenFilter(tokenizer); final TokenStream colorsStream = new PreAnnotatedTokenFilter(textStream.newSinkTokenStream(), colorAnnotations);//from w w w. j a v a 2 s . co m final Document doc = new Document(); doc.add(new StoredField(TEXT_FIELD, text)); doc.add(new TextField(TEXT_FIELD, textStream)); doc.add(new TextField(COLOR_FIELD, colorsStream)); writer.addDocument(doc); }
From source file:com.shaie.annots.example.SimplePreAnnotatedTokenFilterExample.java
License:Apache License
/**
 * Indexes one document with its tokens teed into the text field and a color
 * field driven by pre-computed annotation offsets, wrapped so any annotated
 * token matches (AnyAnnotationTokenFilter).
 *
 * @param writer           destination index writer
 * @param text             raw document text; also stored verbatim for retrieval
 * @param colorAnnotations pre-computed annotation positions for the color field
 * @throws IOException on any indexing error
 */
@SuppressWarnings("resource")
private static void addDocument(IndexWriter writer, String text, int... colorAnnotations)
        throws IOException {
    final Tokenizer source = new WhitespaceTokenizer();
    source.setReader(new StringReader(text));
    // Tee the single token stream so the same tokens feed both indexed fields.
    final TeeSinkTokenFilter tee = new TeeSinkTokenFilter(source);
    final TokenStream colors = new AnyAnnotationTokenFilter(
            new SimplePreAnnotatedTokenFilter(tee.newSinkTokenStream(), colorAnnotations));

    final Document document = new Document();
    document.add(new StoredField(TEXT_FIELD, text)); // raw text, stored only
    document.add(new TextField(TEXT_FIELD, tee));    // indexed tokens
    document.add(new TextField(COLOR_FIELD, colors));
    writer.addDocument(document);
}
From source file:com.shaie.PhraseVsSpanQuery.java
License:Apache License
@SuppressWarnings("resource") public static void main(String[] args) throws Exception { final Directory dir = new RAMDirectory(); final IndexWriterConfig conf = new IndexWriterConfig(new WhitespaceAnalyzer()); final IndexWriter writer = new IndexWriter(dir, conf); final Document doc = new Document(); doc.add(new TextField("f", new TokenStream() { final PositionIncrementAttribute pos = addAttribute(PositionIncrementAttribute.class); final CharTermAttribute term = addAttribute(CharTermAttribute.class); boolean first = true, done = false; @Override//w w w . jav a 2 s . co m public boolean incrementToken() throws IOException { if (done) { return false; } if (first) { term.setEmpty().append("a"); pos.setPositionIncrement(1); first = false; } else { term.setEmpty().append("b"); pos.setPositionIncrement(0); done = true; } return true; } })); writer.addDocument(doc); writer.close(); final DirectoryReader reader = DirectoryReader.open(dir); final IndexSearcher searcher = new IndexSearcher(reader); final LeafReader ar = reader.leaves().get(0).reader(); final TermsEnum te = ar.terms("f").iterator(); BytesRef scratch = new BytesRef(); while ((scratch = te.next()) != null) { System.out.println(scratch.utf8ToString()); final PostingsEnum dape = ar.postings(new Term("f", scratch.utf8ToString())); System.out.println(" doc=" + dape.nextDoc() + ", pos=" + dape.nextPosition()); } System.out.println(); // try a phrase query with a slop final PhraseQuery pqNoSlop = buildPhraseQuery(0); System.out.println("searching for \"a b\"; num results = " + searcher.search(pqNoSlop, 10).totalHits); final PhraseQuery pqSlop1 = buildPhraseQuery(1); System.out.println("searching for \"a b\"~1; num results = " + searcher.search(pqSlop1, 10).totalHits); final PhraseQuery pqSlop3 = buildPhraseQuery(3); System.out.println("searching for \"a b\"~3; num results = " + searcher.search(pqSlop3, 10).totalHits); final SpanNearQuery snqUnOrdered = new SpanNearQuery( new SpanQuery[] { new SpanTermQuery(new 
Term("f", "a")), new SpanTermQuery(new Term("f", "b")) }, 1, false); System.out.println("searching for SpanNearUnordered('a', 'b'), slop=1; num results = " + searcher.search(snqUnOrdered, 10).totalHits); final SpanNearQuery snqOrdered = new SpanNearQuery( new SpanQuery[] { new SpanTermQuery(new Term("f", "a")), new SpanTermQuery(new Term("f", "b")) }, 1, true); System.out.println("searching for SpanNearOrdered('a', 'b'), slop=1; num results = " + searcher.search(snqOrdered, 10).totalHits); reader.close(); }