List of usage examples for org.apache.lucene.index IndexWriter addDocument
public long addDocument(Iterable<? extends IndexableField> doc) throws IOException
From source file:com.zghw.lucene.demo.FormBasedXmlQueryDemo.java
License:Apache License
private void openExampleIndex() throws IOException { //Create a RAM-based index from our test data file RAMDirectory rd = new RAMDirectory(); IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_CURRENT, analyzer); IndexWriter writer = new IndexWriter(rd, iwConfig); InputStream dataIn = getServletContext().getResourceAsStream("/WEB-INF/data.tsv"); BufferedReader br = new BufferedReader(new InputStreamReader(dataIn, StandardCharsets.UTF_8)); String line = br.readLine();/* w ww. j ava 2s . co m*/ final FieldType textNoNorms = new FieldType(TextField.TYPE_STORED); textNoNorms.setOmitNorms(true); while (line != null) { line = line.trim(); if (line.length() > 0) { //parse row and create a document StringTokenizer st = new StringTokenizer(line, "\t"); Document doc = new Document(); doc.add(new Field("location", st.nextToken(), textNoNorms)); doc.add(new Field("salary", st.nextToken(), textNoNorms)); doc.add(new Field("type", st.nextToken(), textNoNorms)); doc.add(new Field("description", st.nextToken(), textNoNorms)); writer.addDocument(doc); } line = br.readLine(); } writer.close(); //open searcher // this example never closes it reader! IndexReader reader = DirectoryReader.open(rd); searcher = new IndexSearcher(reader); }
From source file:com.zghw.lucene.demo.IndexFiles.java
License:Apache License
/** * Indexes the given file using the given writer, or if a directory is given, * recurses over files and directories found under the given directory. * /* w w w . ja va2 s. c o m*/ * NOTE: This method indexes one document per input file. This is slow. For good * throughput, put multiple documents into your input file(s). An example of this is * in the benchmark module, which can create "line doc" files, one document per line, * using the * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html" * >WriteLineDocTask</a>. * * @param writer Writer to the index where the given file/dir info will be stored * @param file The file to index, or the directory to recurse into to find files to index * @throws IOException If there is a low-level I/O error */ static void indexDocs(IndexWriter writer, File file) throws IOException { // do not try to index files that cannot be read if (file.canRead()) { if (file.isDirectory()) { String[] files = file.list(); // an IO error could occur if (files != null) { for (int i = 0; i < files.length; i++) { indexDocs(writer, new File(file, files[i])); } } } else { FileInputStream fis; try { fis = new FileInputStream(file); } catch (FileNotFoundException fnfe) { // at least on windows, some temporary files raise this exception with an "access denied" message // checking if the file can be read doesn't help return; } try { // make a new, empty document Document doc = new Document(); // Add the path of the file as a field named "path". Use a // field that is indexed (i.e. searchable), but don't tokenize // the field into separate words and don't index term frequency // or positional information: Field pathField = new StringField("path", file.getPath(), Field.Store.YES); doc.add(pathField); // Add the last modified date of the file a field named "modified". // Use a LongField that is indexed (i.e. efficiently filterable with // NumericRangeFilter). This indexes to milli-second resolution, which // is often too fine. You could instead create a number based on // year/month/day/hour/minutes/seconds, down the resolution you require. // For example the long value 2011021714 would mean // February 17, 2011, 2-3 PM. doc.add(new LongField("modified", file.lastModified(), Field.Store.NO)); // Add the contents of the file to a field named "contents". Specify a Reader, // so that the text of the file is tokenized and indexed, but not stored. // Note that FileReader expects the file to be in UTF-8 encoding. // If that's not the case searching for special characters will fail. doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, StandardCharsets.UTF_8)))); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): System.out.println("adding " + file); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: System.out.println("updating " + file); writer.updateDocument(new Term("path", file.getPath()), doc); } } finally { fis.close(); } } } }
From source file:com.zghw.lucene.demo.MultiCategoryListsFacetsExample.java
License:Apache License
/** Build the example index. */ private void index() throws IOException { IndexWriter indexWriter = new IndexWriter(indexDir, new IndexWriterConfig(FacetExamples.EXAMPLES_VER, new WhitespaceAnalyzer())); // Writes facet ords to a separate directory from the main index DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir); Document doc = new Document(); doc.add(new FacetField("Author", "Bob")); doc.add(new FacetField("Publish Date", "2010", "10", "15")); indexWriter.addDocument(config.build(taxoWriter, doc)); doc = new Document(); doc.add(new FacetField("Author", "Lisa")); doc.add(new FacetField("Publish Date", "2010", "10", "20")); indexWriter.addDocument(config.build(taxoWriter, doc)); doc = new Document(); doc.add(new FacetField("Author", "Lisa")); doc.add(new FacetField("Publish Date", "2012", "1", "1")); indexWriter.addDocument(config.build(taxoWriter, doc)); doc = new Document(); doc.add(new FacetField("Author", "Susan")); doc.add(new FacetField("Publish Date", "2012", "1", "7")); indexWriter.addDocument(config.build(taxoWriter, doc)); doc = new Document(); doc.add(new FacetField("Author", "Frank")); doc.add(new FacetField("Publish Date", "1999", "5", "5")); indexWriter.addDocument(config.build(taxoWriter, doc)); indexWriter.close();/*from w ww. j ava 2 s . c o m*/ taxoWriter.close(); }
From source file:com.zghw.lucene.demo.RangeFacetsExample.java
License:Apache License
/** Build the example index. */ public void index() throws IOException { IndexWriter indexWriter = new IndexWriter(indexDir, new IndexWriterConfig(FacetExamples.EXAMPLES_VER, new WhitespaceAnalyzer())); // Add documents with a fake timestamp, 1000 sec before // "now", 2000 sec before "now", ...: for (int i = 0; i < 100; i++) { Document doc = new Document(); long then = nowSec - i * 1000; // Add as doc values field, so we can compute range facets: doc.add(new NumericDocValuesField("timestamp", then)); // Add as numeric field so we can drill-down: doc.add(new LongField("timestamp", then, Field.Store.NO)); indexWriter.addDocument(doc); }/*from w w w .j a va 2s . com*/ // Open near-real-time searcher searcher = new IndexSearcher(DirectoryReader.open(indexWriter, true)); indexWriter.close(); }
From source file:com.zghw.lucene.demo.SimpleSortedSetFacetsExample.java
License:Apache License
/** Build the example index. */ private void index() throws IOException { IndexWriter indexWriter = new IndexWriter(indexDir, new IndexWriterConfig(FacetExamples.EXAMPLES_VER, new WhitespaceAnalyzer())); Document doc = new Document(); doc.add(new SortedSetDocValuesFacetField("Author", "Bob")); doc.add(new SortedSetDocValuesFacetField("Publish Year", "2010")); indexWriter.addDocument(config.build(doc)); doc = new Document(); doc.add(new SortedSetDocValuesFacetField("Author", "Lisa")); doc.add(new SortedSetDocValuesFacetField("Publish Year", "2010")); indexWriter.addDocument(config.build(doc)); doc = new Document(); doc.add(new SortedSetDocValuesFacetField("Author", "Lisa")); doc.add(new SortedSetDocValuesFacetField("Publish Year", "2012")); indexWriter.addDocument(config.build(doc)); doc = new Document(); doc.add(new SortedSetDocValuesFacetField("Author", "Susan")); doc.add(new SortedSetDocValuesFacetField("Publish Year", "2012")); indexWriter.addDocument(config.build(doc)); doc = new Document(); doc.add(new SortedSetDocValuesFacetField("Author", "Frank")); doc.add(new SortedSetDocValuesFacetField("Publish Year", "1999")); indexWriter.addDocument(config.build(doc)); indexWriter.close();/*w ww .ja v a 2 s.c om*/ }
From source file:com.zsq.lucene.chapter1.IndexFiles.java
License:Apache License
/** Indexes a single document */ static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException { try (InputStream stream = Files.newInputStream(file)) { // make a new, empty document Document doc = new Document(); // Add the path of the file as a field named "path". Use a // field that is indexed (i.e. searchable), but don't tokenize // the field into separate words and don't index term frequency // or positional information: Field pathField = new StringField("path", file.toString(), Field.Store.YES); doc.add(pathField);/*w ww . j a v a2 s.com*/ // Add the last modified date of the file a field named "modified". // Use a LongField that is indexed (i.e. efficiently filterable with // NumericRangeFilter). This indexes to milli-second resolution, which // is often too fine. You could instead create a number based on // year/month/day/hour/minutes/seconds, down the resolution you require. // For example the long value 4 would mean // February 17, 1, 2-3 PM. doc.add(new LongField("modified", lastModified, Field.Store.NO)); // Add the contents of the file to a field named "contents". Specify a Reader, // so that the text of the file is tokenized and indexed, but not stored. // Note that FileReader expects the file to be in UTF-8 encoding. // If that's not the case searching for special characters will fail. doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)))); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): System.out.println("adding " + file); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: System.out.println("updating " + file); writer.updateDocument(new Term("path", file.toString()), doc); } } }
From source file:concurrency.IndexFiles.java
License:Apache License
/** Indexes a single document */ static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException { try (InputStream stream = Files.newInputStream(file)) { // make a new, empty document Document doc = new Document(); // Add the path of the file as a field named "path". Use a // field that is indexed (i.e. searchable), but don't tokenize // the field into separate words and don't index term frequency // or positional information: Field pathField = new StringField("path", file.toString(), Field.Store.YES); doc.add(pathField);//w w w . j a v a2 s .com // Add the last modified date of the file a field named "modified". // Use a LongField that is indexed (i.e. efficiently filterable with // NumericRangeFilter). This indexes to milli-second resolution, // which // is often too fine. You could instead create a number based on // year/month/day/hour/minutes/seconds, down the resolution you // require. // For example the long value 2011021714 would mean // February 17, 2011, 2-3 PM. doc.add(new LongField("modified", lastModified, Field.Store.NO)); // Add the contents of the file to a field named "contents". Specify // a Reader, // so that the text of the file is tokenized and indexed, but not // stored. // Note that FileReader expects the file to be in UTF-8 encoding. // If that's not the case searching for special characters will // fail. doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)))); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can // be there): System.out.println("adding " + file); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been // indexed) so // we use updateDocument instead to replace the old one matching // the exact // path, if present: System.out.println("updating " + file); writer.updateDocument(new Term("path", file.toString()), doc); } } }
From source file:control.Search.java
/** * Index all feeds by 'id', 'title', 'link' and 'description' in a static inverted index * //ww w .j a va 2 s . c o m * @param feed Feeds to be indexed */ protected void indexFeed(Feed feed) { try { StandardAnalyzer analyzer = new StandardAnalyzer(); IndexWriterConfig config = new IndexWriterConfig(analyzer); IndexWriter w = new IndexWriter(index, config); for (int i = 0; i < feed.getItemCount(); i++) { FeedItem item = feed.getItem(i); if (!indexedFeedsIds.contains((String) item.getGUID())) { Document document = new Document(); document.add(new TextField("id", item.getGUID(), Field.Store.YES)); document.add(new TextField("title", item.getTitle(), Field.Store.YES)); document.add(new TextField("link", item.getLink().toString(), Field.Store.YES)); document.add(new TextField("description", item.getDescriptionAsText(), Field.Store.YES)); w.addDocument(document); indexedFeedsIds.add(item.getGUID()); } } w.close(); } catch (IOException ex) { Logger.getLogger(Search.class.getName()).log(Level.SEVERE, null, ex); } }
From source file:cs412.project.search.IndexFiles.java
License:Apache License
/** * Indexes the given file using the given writer, or if a directory is given, * recurses over files and directories found under the given directory. * //from w w w .ja v a 2s . co m * NOTE: This method indexes one document per input file. This is slow. For good * throughput, put multiple documents into your input file(s). An example of this is * in the benchmark module, which can create "line doc" files, one document per line, * using the * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html" * >WriteLineDocTask</a>. * * @param writer Writer to the index where the given file/dir info will be stored * @param file The file to index, or the directory to recurse into to find files to index * @throws IOException If there is a low-level I/O error */ static void indexDocs(IndexWriter writer, File file) throws IOException { // do not try to index files that cannot be read if (file.canRead()) { if (file.isDirectory()) { String[] files = file.list(); // an IO error could occur if (files != null) { for (int i = 0; i < files.length; i++) { indexDocs(writer, new File(file, files[i])); } } } else { FileInputStream fis; try { fis = new FileInputStream(file); } catch (FileNotFoundException fnfe) { // at least on windows, some temporary files raise this exception with an "access denied" message // checking if the file can be read doesn't help return; } try { // make a new, empty document Document doc = new Document(); // Add the path of the file as a field named "path". Use a // field that is indexed (i.e. searchable), but don't tokenize // the field into separate words and don't index term frequency // or positional information: Field pathField = new StringField("path", file.getPath(), Field.Store.YES); doc.add(pathField); // Add the last modified date of the file a field named "modified". // Use a LongField that is indexed (i.e. efficiently filterable with // NumericRangeFilter). This indexes to milli-second resolution, which // is often too fine. You could instead create a number based on // year/month/day/hour/minutes/seconds, down the resolution you require. // For example the long value 2011021714 would mean // February 17, 2011, 2-3 PM. doc.add(new LongField("modified", file.lastModified(), Field.Store.YES)); // Add the contents of the file to a field named "contents". Specify a Reader, // so that the text of the file is tokenized and indexed, but not stored. // Note that FileReader expects the file to be in UTF-8 encoding. // If that's not the case searching for special characters will fail. doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8")))); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): // System.out.println("adding " + file); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: System.out.println("updating " + file); writer.updateDocument(new Term("path", file.getPath()), doc); writer.updateDocument(new Term("modified", Long.toString(file.lastModified())), doc); } } finally { fis.close(); } } } }
From source file:cs571.proj1.IndexFiles.java
License:Apache License
/** Indexes a single document */ static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException { if (checkForIgnoredFile(file.getFileName().toString())) return;// ww w . j a v a 2 s. com try (InputStream stream = Files.newInputStream(file)) { // make a new, empty document // Add the path of the file as a field named "path". Use a // field that is indexed (i.e. searchable), but don't tokenize // the field into separate words and don't index term frequency // or positional information: // Add the last modified date of the file a field named "modified". // Use a LongField that is indexed (i.e. efficiently filterable with // NumericRangeFilter). This indexes to milli-second resolution, which // is often too fine. You could instead create a number based on // year/month/day/hour/minutes/seconds, down the resolution you require. // For example the long value 2011021714 would mean // February 17, 2011, 2-3 PM. // Add the contents of the file to a field named "contents". Specify a Reader, // so that the text of the file is tokenized and indexed, but not stored. // Note that FileReader expects the file to be in UTF-8 encoding. // If that's not the case searching for special characters will fail. //doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)))); BufferedReader br = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)); String line, docNO = null, docID = null, s; StringBuilder sb = new StringBuilder(); Field docIDField = null; Document doc = new Document(); boolean docFound = false; // boolean testOut = false; // PrintWriter pw = new PrintWriter("out.txt"); while ((line = br.readLine()) != null) { line = line.trim(); //To-do detect document boundaries if (line.startsWith(docNO_start)) { docNO = removeTags(line); continue; } if (line.startsWith(docID_start)) { docID = removeTags(line); continue; } // if(checkForIgnoreStartsWith(line)){continue;} // if(checkForIgnoreLine(line)){continue;} if (line.equals(docStart)) { docFound = true; //testOut = true; continue; } if (line.equals(docEnd)) { docFound = false; if (docNO != null) { docIDField = new StringField("docID", docNO, Field.Store.YES); doc.add(docIDField); } else if (docID != null) { docIDField = new StringField("docID", docID, Field.Store.YES); doc.add(docIDField); } else { continue; } //doc.add(new TermVector("contents",sb.toString(),Field.Store.NO)); if (tfidf || bm25) { FieldType tv = new FieldType(); tv.setTokenized(true); tv.setStoreTermVectors(true); tv.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); s = sb.toString(); byte[] bytes = s.getBytes(); ByteArrayInputStream bstream = new ByteArrayInputStream(bytes); InputStreamReader isr = new InputStreamReader(bstream); doc.add(new Field("contents", isr, tv)); } else { doc.add(new TextField("contents", sb.toString(), Field.Store.NO)); } Field pathField = new StringField("path", file.toString(), Field.Store.YES); doc.add(pathField); doc.add(new LongField("modified", lastModified, Field.Store.NO)); numOfDocuments++; sb.setLength(0); // testOut = false; // pw.close(); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): System.out.println("adding " + file); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: System.out.println("updating " + file); writer.updateDocument(new Term("path", file.toString()), doc); } doc = new Document(); continue; } if (docFound) { sb.append(line).append("\n"); // if(testOut) pw.println(line); } } } }