List of usage examples for org.apache.lucene.index IndexWriter getConfig
public LiveIndexWriterConfig getConfig()
From source file:FileIndexer.java
License:Apache License
/** Indexes a single document */ static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException { try (InputStream stream = Files.newInputStream(file)) { // make a new, empty document Document doc = new Document(); // Add the path of the file as a field named "path". Use a // field that is indexed (i.e. searchable), but don't tokenize // the field into separate words and don't index term frequency // or positional information: Field pathField = new StringField("path", file.toString(), Field.Store.YES); doc.add(pathField);//from w ww . j a v a 2 s. c om // Add the last modified date of the file a field named "modified". // Use a LongField that is indexed (i.e. efficiently filterable with // NumericRangeFilter). This indexes to milli-second resolution, which // is often too fine. You could instead create a number based on // year/month/day/hour/minutes/seconds, down the resolution you require. // For example the long value 2011021714 would mean // February 17, 2011, 2-3 PM. doc.add(new LongField("modified", lastModified, Field.Store.NO)); // Add the contents of the file to a field named "contents". Specify a Reader, // so that the text of the file is tokenized and indexed, but not stored. // Note that FileReader expects the file to be in UTF-8 encoding. // If that's not the case searching for special characters will fail. doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)))); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): System.out.println("adding " + file); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: System.out.println("updating " + file); writer.updateDocument(new Term("path", file.toString()), doc); } } }
From source file:antnlp.opie.indexsearch.IndexFiles.java
License:Apache License
/** Indexes a single document */ static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException { InputStreamReader iReader = new InputStreamReader(Files.newInputStream(file), StandardCharsets.UTF_8); BufferedReader bufReader = new BufferedReader(iReader); String docLine = null;/*from w w w . ja va 2 s . co m*/ while ((docLine = bufReader.readLine()) != null) { docLine = docLine.trim(); if (docLine.length() == 0) continue; String[] column = docLine.split("\\t"); System.out.println(column[0]); System.out.println(column[1]); // make a new, empty document Document doc = new Document(); // Add the id of the file as a field named "id". Use a // field that is indexed (i.e. searchable), but don't tokenize // the field into separate words and don't index term frequency // or positional information: Field docidField = new StringField("docid", column[0], Field.Store.YES); doc.add(docidField); // Add the last modified date of the file a field named "modified". // Use a LongPoint that is indexed (i.e. efficiently filterable with // PointRangeQuery). This indexes to milli-second resolution, which // is often too fine. You could instead create a number based on // year/month/day/hour/minutes/seconds, down the resolution you require. // For example the long value 2011021714 would mean // February 17, 2011, 2-3 PM. doc.add(new LongPoint("modified", lastModified)); // Add the contents of the file to a field named "contents". Specify a Reader, // so that the text of the file is tokenized and indexed, but not stored. // Note that FileReader expects the file to be in UTF-8 encoding. // If that's not the case searching for special characters will fail. doc.add(new TextField("contents", column[1], Field.Store.YES)); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): System.out.println("adding " + column[0]); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: System.out.println("updating " + column[0]); writer.updateDocument(new Term("docid", column[0]), doc); } } iReader.close(); bufReader.close(); }
From source file:aos.lucene.admin.SearcherManager.java
License:Apache License
public SearcherManager(IndexWriter writer) throws IOException { this.writer = writer; IndexReader reader = DirectoryReader.open(writer.getDirectory()); currentSearcher = new IndexSearcher(reader); warm(currentSearcher);// w w w . ja va2 s. com writer.getConfig().setMergedSegmentWarmer(new IndexWriter.IndexReaderWarmer() { public void warm(AtomicReader reader) throws IOException { SearcherManager.this.warm(new IndexSearcher(reader)); } }); }
From source file:api.startup.PDFIndexer.java
License:Open Source License
/** * Indexes a single document and writes it to the given index writer * @param writer - the index writer to writer * @param metadata - the document/*from ww w . j a va 2 s . co m*/ * @throws IOException */ static void indexDoc(IndexWriter writer, DocumentMetadata metadata) throws IOException { Path file = Paths.get(metadata.getFilename()); try { Document doc = new Document(); Field pathField = new StringField(Constants.FIELD_PATH, file.toString(), Field.Store.YES); doc.add(pathField); // Add Document metadata // doc.add(new StringField(Constants.FIELD_AUTHOR, metadata.getAuthor(), Field.Store.YES)); doc.add(new StringField(Constants.FIELD_TITLE, metadata.getTitle(), Field.Store.YES)); doc.add(new StringField(Constants.FIELD_CONFERENCE, metadata.getConference(), Field.Store.YES)); // End of Document Metadata // Field modified = new LongField(Constants.FIELD_MODIFIED, Files.getLastModifiedTime(file).toMillis(), Field.Store.YES); doc.add(modified); PDFTextExtractor extractor = new PDFTextExtractor(); // Get the string contents String textContents = extractor.extractText(file.toString()); // Store the string contents FieldType contentsType = new FieldType(); contentsType.setStored(true); contentsType.setTokenized(true); contentsType.setStoreTermVectors(true); contentsType.setStoreTermVectorPositions(true); contentsType.setStoreTermVectorPayloads(true); contentsType.setStoreTermVectorOffsets(true); contentsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); Field contents = new Field(Constants.FIELD_CONTENTS, textContents, contentsType); doc.add(contents); if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): log.info("adding " + file + " to index"); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: log.info("updating " + file + " in index"); writer.updateDocument(new Term(Constants.FIELD_PATH, file.toString()), doc); } } catch (IOException e) { log.error("Failed to read file " + metadata.getFilename()); } }
From source file:Application.mediaIndexer.java
/** * Indexes a single document// w w w . ja v a2s. c om * * @throws TikaException * @throws SAXException */ public static void indexDoc(IndexWriter writer, Path file, TextArea results, long lastModified) throws IOException, SAXException, TikaException { AutoDetectParser parser = new AutoDetectParser(); BodyContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); try (InputStream stream = Files.newInputStream(file)) { parser.parse(stream, handler, metadata); Document doc = new Document(); String[] metadataNames = metadata.names(); for (String name : metadataNames) doc.add(new TextField(name, metadata.get(name), Field.Store.YES)); doc.add(new StringField("path", file.toString(), Field.Store.YES)); doc.add(new LongPoint("modified", lastModified)); results.appendText("Title: " + metadata.get("title") + "\n"); results.appendText("Artists: " + metadata.get("xmpDM:artist") + "\n"); results.appendText("Genre: " + metadata.get("xmpDM:genre") + "\n"); results.appendText("Year: " + metadata.get("xmpDM:releaseDate") + "\n"); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can // be there): results.appendText("adding " + file + "\n"); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been // indexed): results.appendText("updating " + file); writer.updateDocument(new Term("path", file.toString()), doc); } } }
From source file:at.ac.univie.mminf.luceneSKOS.analysis.engine.jena.SKOSEngineImpl.java
License:Apache License
/** * Creates the synonym index//w w w .ja v a 2 s .c om * * @throws IOException */ private void indexSKOSModel() throws IOException { IndexWriterConfig cfg = new IndexWriterConfig(analyzer); IndexWriter writer = new IndexWriter(indexDir, cfg); writer.getConfig().setRAMBufferSizeMB(48); /* iterate SKOS concepts, create Lucene docs and add them to the index */ ResIterator concept_iter = skosModel.listResourcesWithProperty(RDF.type, SKOS.Concept); while (concept_iter.hasNext()) { Resource skos_concept = concept_iter.next(); Document concept_doc = createDocumentsFromConcept(skos_concept); writer.addDocument(concept_doc); } writer.close(); }
From source file:at.ac.univie.mminf.luceneSKOS.skos.impl.SKOSEngineImpl.java
License:Apache License
/** * Creates the synonym index//from w w w. ja v a 2 s . c om * * @throws IOException */ private void indexSKOSModel() throws IOException { IndexWriterConfig cfg = new IndexWriterConfig(matchVersion, analyzer); IndexWriter writer = new IndexWriter(indexDir, cfg); writer.getConfig().setRAMBufferSizeMB(48); /* iterate SKOS concepts, create Lucene docs and add them to the index */ ResIterator concept_iter = skosModel.listResourcesWithProperty(RDF.type, SKOS.Concept); while (concept_iter.hasNext()) { Resource skos_concept = concept_iter.next(); Document concept_doc = createDocumentsFromConcept(skos_concept); // System.out.println("Adding document to index " + concept_doc); writer.addDocument(concept_doc); } writer.close(); }
From source file:back.Indexer.java
License:Apache License
/** * Indexes the given file using the given writer, or if a directory is given, * recurses over files and directories found under the given directory. * // w ww .jav a2 s .com * NOTE: This method indexes one document per input file. This is slow. For good * throughput, put multiple documents into your input file(s). An example of this is * in the benchmark module, which can create "line doc" files, one document per line, * using the * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html" * >WriteLineDocTask</a>. * * @param writer Writer to the index where the given file/dir info will be stored * @param file The file to index, or the directory to recurse into to find files to index * @throws IOException If there is a low-level I/O error */ static void indexDocs(IndexWriter writer, File file) throws IOException { // do not try to index files that cannot be read if (file.canRead()) { if (file.isDirectory()) { String[] files = file.list(); // an IO error could occur if (files != null) { for (int i = 0; i < files.length; i++) { indexDocs(writer, new File(file, files[i])); } } } else { FileInputStream fis; try { fis = new FileInputStream(file); } catch (FileNotFoundException fnfe) { // at least on windows, some temporary files raise this exception with an "access denied" message // checking if the file can be read doesn't help return; } try { // make a new, empty document Document doc = new Document(); // Add the path of the file as a field named "path". Use a // field that is indexed (i.e. searchable), but don't tokenize // the field into separate words and don't index term frequency // or positional information: Field pathField = new StringField("path", file.getPath(), Field.Store.YES); doc.add(pathField); // Add the last modified date of the file a field named "modified". // Use a LongField that is indexed (i.e. efficiently filterable with // NumericRangeFilter). This indexes to milli-second resolution, which // is often too fine. You could instead create a number based on // year/month/day/hour/minutes/seconds, down the resolution you require. // For example the long value 2011021714 would mean // February 17, 2011, 2-3 PM. doc.add(new LongField("modified", file.lastModified(), Field.Store.NO)); // Add the contents of the file to a field named "contents". Specify a Reader, // so that the text of the file is tokenized and indexed, but not stored. // Note that FileReader expects the file to be in UTF-8 encoding. // If that's not the case searching for special characters will fail. doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8")))); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): System.out.println("adding " + file); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: System.out.println("updating " + file); writer.updateDocument(new Term("path", file.getPath()), doc); } } finally { fis.close(); } } } }
From source file:br.andrew.lucene.testing.IndexFiles.java
License:Apache License
/** * Indexes the given file using the given writer, or if a directory is given, * recurses over files and directories found under the given directory. * /*from w w w . j a v a 2 s . c o m*/ * NOTE: This method indexes one document per input file. This is slow. For good * throughput, put multiple documents into your input file(s). An example of this is * in the benchmark module, which can create "line doc" files, one document per line, * using the * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html" * >WriteLineDocTask</a>. * * @param writer Writer to the index where the given file/dir info will be stored * @param file The file to index, or the directory to recurse into to find files to index * @throws IOException If there is a low-level I/O error */ static void indexDocs(final IndexWriter writer, final File file) throws IOException { // do not try to index files that cannot be read if (file.canRead()) { if (file.isDirectory()) { final String[] files = file.list(); // an IO error could occur if (files != null) { for (int i = 0; i < files.length; i++) { IndexFiles.indexDocs(writer, new File(file, files[i])); } } } else { FileInputStream fis; try { fis = new FileInputStream(file); } catch (final FileNotFoundException fnfe) { // at least on windows, some temporary files raise this exception with an "access denied" message // checking if the file can be read doesn't help return; } try { // make a new, empty document final Document doc = new Document(); // Add the path of the file as a field named "path". Use a // field that is indexed (i.e. searchable), but don't tokenize // the field into separate words and don't index term frequency // or positional information: final Field pathField = new StringField("path", file.getPath(), Field.Store.YES); doc.add(pathField); // Add the last modified date of the file a field named "modified". // Use a LongField that is indexed (i.e. efficiently filterable with // NumericRangeFilter). This indexes to milli-second resolution, which // is often too fine. You could instead create a number based on // year/month/day/hour/minutes/seconds, down the resolution you require. // For example the long value 2011021714 would mean // February 17, 2011, 2-3 PM. doc.add(new LongField("modified", file.lastModified(), Field.Store.NO)); // Add the contents of the file to a field named "contents". Specify a Reader, // so that the text of the file is tokenized and indexed, but not stored. // Note that FileReader expects the file to be in UTF-8 encoding. // If that's not the case searching for special characters will fail. doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8")))); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): System.out.println("adding " + file); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: System.out.println("updating " + file); writer.updateDocument(new Term("path", file.getPath()), doc); } } finally { fis.close(); } } } }
From source file:buscador.IndexFiles.java
License:Apache License
/** * Indexes the given file using the given writer, or if a directory is * given, recurses over files and directories found under the given * directory./* w w w . j ava 2 s .c o m*/ * * NOTE: This method indexes one document per input file. This is slow. For * good throughput, put multiple documents into your input file(s). An * example of this is in the benchmark module, which can create "line doc" * files, one document per line, using the * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html" * >WriteLineDocTask</a>. * * @param writer Writer to the index where the given file/dir info will be * stored * @param file The file to index, or the directory to recurse into to find * files to index * @throws IOException If there is a low-level I/O error */ static void indexDocs(IndexWriter writer, File file) throws IOException { // do not try to index files that cannot be read if (file.canRead()) { if (file.isDirectory()) { String[] files = file.list(); // an IO error could occur if (files != null) { for (int i = 0; i < files.length; i++) { indexDocs(writer, new File(file, files[i])); } } } else { FileInputStream fis; try { fis = new FileInputStream(file); } catch (FileNotFoundException fnfe) { // at least on windows, some temporary files raise this exception with an "access denied" message // checking if the file can be read doesn't help return; } try { // make a new, empty document Document doc = new Document(); // Add the path of the file as a field named "path". Use a // field that is indexed (i.e. searchable), but don't tokenize // the field into separate words and don't index term frequency // or positional information: Field pathField = new StringField("path", file.getPath(), Field.Store.YES); doc.add(pathField); // Add the last modified date of the file a field named "modified". // Use a LongField that is indexed (i.e. efficiently filterable with // NumericRangeFilter). This indexes to milli-second resolution, which // is often too fine. You could instead create a number based on // year/month/day/hour/minutes/seconds, down the resolution you require. // For example the long value 2011021714 would mean // February 17, 2011, 2-3 PM. doc.add(new LongField("modified", file.lastModified(), Field.Store.YES)); insertarenIndice(file, "dc:creator", "creator", doc, "text"); insertarenIndice(file, "dc:title", "title", doc, "text"); insertarenIndice(file, "dc:description", "description", doc, "text"); insertarenIndice(file, "dc:identifier", "identifier", doc, "text"); insertarenIndice(file, "dc:date", "date", doc, "text"); // Add the contents of the file to a field named "contents". Specify a Reader, // so that the text of the file is tokenized and indexed, but not stored. // Note that FileReader expects the file to be in UTF-8 encoding. // If that's not the case searching for special characters will fail if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): System.out.println("adding " + file); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: System.out.println("updating " + file); writer.updateDocument(new Term("path", file.getPath()), doc); } } finally { fis.close(); } } } }