List of usage examples for org.apache.lucene.index.IndexWriter.addDocument
public long addDocument(Iterable<? extends IndexableField> doc) throws IOException
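Before the per-project sources below, here is a minimal, self-contained sketch of the call in isolation. It assumes a recent Lucene release (6.2 or later, where addDocument returns the operation's sequence number rather than void); the index path and field names are illustrative only and are not taken from any of the sources.

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class AddDocumentExample {
    public static void main(String[] args) throws IOException {
        // Open (or create) a file-system index; "example-index" is an illustrative path.
        try (Directory dir = FSDirectory.open(Paths.get("example-index"));
             IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {

            // A Document is an Iterable<IndexableField>, which is exactly what addDocument accepts.
            Document doc = new Document();
            doc.add(new StringField("id", "doc-1", Field.Store.YES));           // indexed as-is, stored
            doc.add(new TextField("contents", "hello lucene", Field.Store.NO)); // tokenized, not stored

            // In Lucene 6.2+ addDocument returns the sequence number of the indexing operation.
            long seqNo = writer.addDocument(doc);
            System.out.println("added document, sequence number = " + seqNo);

            writer.commit();
        }
    }
}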
From source file:com.edu.lucene.IndexFiles.java
License:Apache License
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            System.out.println("adding " + file);
            try {
                writer.addDocument(FileDocument.Document(file));
            }
            // at least on windows, some temporary files raise this exception with an "access denied" message
            // checking if the file can be read doesn't help
            catch (FileNotFoundException fnfe) {
                ;
            }
        }
    }
}
From source file:com.ekinoks.lucene.introduction.demos.IndexFiles.java
License:Apache License
/**
 * Indexes the given file using the given writer, or if a directory is
 * given, recurses over files and directories found under the given
 * directory.
 *
 * NOTE: This method indexes one document per input file. This is slow. For
 * good throughput, put multiple documents into your input file(s). An
 * example of this is in the benchmark module, which can create "line doc"
 * files, one document per line, using the <a href=
 * "../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *
 * @param writer
 *            Writer to the index where the given file/dir info will be
 *            stored
 * @param file
 *            The file to index, or the directory to recurse into to find
 *            files to index
 * @throws IOException
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this
                // exception with an "access denied" message
                // checking if the file can be read doesn't help
                return;
            }
            try {
                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path". Use a
                // field that is indexed (i.e. searchable), but don't tokenize
                // the field into separate words and don't index term frequency
                // or positional information:
                Field pathField = new Field("path", file.getPath(), Field.Store.YES,
                        Field.Index.NOT_ANALYZED_NO_NORMS);
                pathField.setOmitTermFreqAndPositions(true);
                doc.add(pathField);

                // Add the last modified date of the file as a field named "modified".
                // Use a NumericField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter). This indexes to milli-second resolution, which
                // is often too fine. You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down to the resolution you require.
                // For example the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.
                NumericField modifiedField = new NumericField("modified");
                modifiedField.setLongValue(file.lastModified());
                doc.add(modifiedField);

                // Add the contents of the file to a field named "contents". Specify a Reader,
                // so that the text of the file is tokenized and indexed, but not stored.
                // Note that FileReader expects the file to be in UTF-8 encoding.
                // If that's not the case searching for special characters will fail.
                doc.add(new Field("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been indexed) so
                    // we use updateDocument instead to replace the old one matching the exact
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }
            } finally {
                fis.close();
            }
        }
    }
}
From source file:com.emental.mindraider.core.search.SearchCommander.java
License:Apache License
/**
 * Index documents.
 *
 * @param writer
 *            the index writer
 * @param file
 *            the file to write
 * @param rebuildSearchIndexJDialog
 *            the rebuild search index JDialog
 * @throws IOException
 *             the I/O exception
 */
public static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (String filename : files) {
                    indexDocs(writer, new File(file, filename));
                }
            }
        } else {
            StatusBar.setText(action, file.getAbsolutePath(), 70);
            try {
                // I'm interested only in indexing of concepts
                if (file.getAbsolutePath()
                        .indexOf(File.separator + CONCEPTS_DIRECTORY_NAME + File.separator) >= 0) {
                    ConceptResource conceptResource = new ConceptResource(new Resource(file.getAbsolutePath()));

                    // FTS index
                    // TODO parse notebook label from the path for now
                    String notebookUri = conceptResource.getNotebookUri();
                    String notebookLabel;
                    if (notebookUri != null && (notebookUri.indexOf("#") >= 0)) {
                        // TODO from time to time the last letter is killed
                        notebookLabel = notebookUri.substring(notebookUri.indexOf("#") + 1, notebookUri.length());
                        // TODO ugly hack - label must be loaded from the model (slow)
                        notebookLabel = notebookLabel.replaceAll("_", " ");
                    } else {
                        notebookLabel = "Notebook";
                    }

                    // tag (infiltrated)
                    CategoryProperty[] tagsAndFlag = conceptResource.getCategories();
                    if (tagsAndFlag != null && tagsAndFlag.length > 0) {
                        for (CategoryProperty tagOrFlag : tagsAndFlag) {
                            // only tags (not the flag!) are indexed
                            if (tagOrFlag.getCategoryValue() != null
                                    && tagOrFlag.getCategoryValue().length() > 0) {
                                if (!tagOrFlag.getCategoryValue()
                                        .startsWith(MindRaiderConstants.MR_OWL_FLAG_NS)) {
                                    MindRaider.tagCustodian.addOrInc(
                                            new TagEntryImpl(tagOrFlag.getCategoryValue(),
                                                    tagOrFlag.getCategoryCaption(), 1),
                                            new TaggedResourceEntry(notebookUri, notebookLabel,
                                                    conceptResource.getUri(), conceptResource.getLabel(),
                                                    conceptResource.resource.getMetadata().getTimestamp(),
                                                    file.getAbsoluteFile().getAbsolutePath()));
                                }
                            }
                        }
                    }

                    // write it to index
                    writer.addDocument(FileDocument.Document(file, notebookLabel, conceptResource.getLabel(),
                            conceptResource.getUri()));
                }
            } catch (EOFException e) {
                logger.debug("Unable to read file " + file.getAbsolutePath(), e);
            }
            // at least on windows, some temporary files raise this
            // exception with an "access denied" message
            // checking if the file can be read doesn't help
            catch (Exception e) {
                logger.debug("File not found!", e);
            }
        }
    }
}
From source file:com.emental.mindraider.core.search.SearchCommander.java
License:Apache License
public static void updateIndex(File conceptFile, String notebookLabel, String conceptLabel, String conceptUri) {
    // TODO the body of this method to be run in an asynchronous thread
    IndexWriter writer;
    try {
        writer = new IndexWriter(getSearchIndexPath(), new StandardAnalyzer(), false);

        // update document via concept URI
        logger.debug("UPDATing FTS index for concept: " + conceptFile + " # " + notebookLabel + " # "
                + conceptLabel + " # " + conceptUri); // {{debug}}
        Document document = FileDocument.Document(conceptFile, notebookLabel, conceptLabel, conceptUri);
        writer.deleteDocuments(new Term("uri", conceptUri));
        writer.addDocument(document);

        // TODO removed just for now (before it will be done in async)
        //writer.optimize();
        writer.close();
    } catch (Exception e) {
        logger.debug("Unable to update FTS index", e); // {{debug}}
        // TODO close it in finally
    }
}
From source file:com.epimorphics.server.indexers.LuceneIndex.java
License:Apache License
private void indexEntity(IndexWriter iwriter, boolean update, String graphname, Resource entity)
        throws IOException {
    if (entity.isAnon())
        return;
    Document doc = new Document();
    doc.add(new StringField(FIELD_URI, entity.getURI(), Field.Store.YES));
    doc.add(new StringField(FIELD_GRAPH, graphname, Field.Store.YES));
    StmtIterator si = entity.listProperties();
    while (si.hasNext()) {
        Statement s = si.next();
        Property p = s.getPredicate();
        RDFNode value = s.getObject();
        String valueStr = asString(value);
        if (labelProps.contains(p)) {
            doc.add(new TextField(p.getURI(), valueStr, Field.Store.YES));
            doc.add(new TextField(FIELD_LABEL, valueStr, Field.Store.NO));
        } else if (labelOnlyProps.contains(p)) {
            doc.add(new TextField(p.getURI(), valueStr, Field.Store.NO));
            doc.add(new TextField(FIELD_LABEL, valueStr, Field.Store.NO));
        } else if (valueProps.contains(p) || (indexAll && !ignoreProps.contains(p))) {
            if (value.isURIResource()) {
                doc.add(new StringField(p.getURI(), value.asResource().getURI(), Field.Store.YES));
                // Alternative below would share storage of URIs but only allows per document field
                // doc.add( new DerefBytesDocValuesField(p.getURI(), new BytesRef(value.asResource().getURI())) );
            } else if (value.isLiteral()) {
                Literal lvalue = value.asLiteral();
                Object jvalue = lvalue.getValue();
                if (jvalue instanceof Long || jvalue instanceof Integer) {
                    doc.add(new LongField(p.getURI(), ((Number) jvalue).longValue(), Field.Store.YES));
                } else {
                    doc.add(new TextField(p.getURI(), valueStr, Field.Store.YES));
                }
            }
        }
    }
    if (update) {
        iwriter.updateDocument(new Term(FIELD_URI, entity.getURI()), doc);
    } else {
        iwriter.addDocument(doc);
    }
}
From source file:com.example.analyzer.server.database.DbFullTextIndex.java
License:Open Source License
public DbFullTextIndex(DbTable dbTable, int columnOffset) {
    try {
        long beginTime = System.currentTimeMillis();
        ramDirectory = new RAMDirectory();
        IndexWriter writer = new IndexWriter(ramDirectory, new StandardAnalyzer(Version.LUCENE_30),
                new MaxFieldLength(50));
        int rowCount = dbTable.getRowCount();
        for (int rowOffset = 0; rowOffset < rowCount; rowOffset++) {
            String value = dbTable.coalesce(rowOffset, columnOffset, "").toString();
            byte[] idArray = getBytes(rowOffset);
            Document document = new Document();
            document.add(new Field(ID, idArray, Field.Store.YES));
            // TODO: Determine whether we need to store value
            document.add(new Field(VALUE, value, Store.YES, Index.ANALYZED));
            writer.addDocument(document);
        }
        writer.optimize();
        writer.close();
        long endTime = System.currentTimeMillis();
        long elapsedTime = endTime - beginTime;
        System.out.println("created index in " + elapsedTime + " ms");
    } catch (CorruptIndexException e) {
        throw new RuntimeException(e);
    } catch (LockObtainFailedException e) {
        throw new RuntimeException(e);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
From source file:com.example.search.IndexFiles.java
License:Apache License
/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 *
 * NOTE: This method indexes one document per input file. This is slow. For good
 * throughput, put multiple documents into your input file(s). An example of this is
 * in the benchmark module, which can create "line doc" files, one document per line,
 * using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file The file to index, or the directory to recurse into to find files to index
 * @throws IOException
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            PageProcess pageProcessor;
            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
                pageProcessor = new PageProcess(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this exception with an "access denied" message
                // checking if the file can be read doesn't help
                return;
            }
            WebInfo webInfo;
            try {
                // make a new, empty document for each page
                while ((webInfo = pageProcessor.next()) != null) {
                    Document doc = new Document(); // process page

                    // Add the path of the file as a field named "path". Use a
                    // field that is indexed (i.e. searchable), but don't tokenize
                    // the field into separate words and don't index term frequency
                    // or positional information:
                    // Field pathField = new Field("path", file.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
                    // pathField.setIndexOptions(IndexOptions.DOCS_ONLY);
                    // doc.add(pathField);

                    Field urlField = new Field("url", webInfo.url, Field.Store.YES, Field.Index.NO);
                    doc.add(urlField);
                    Field publishidField = new Field("publishid", webInfo.publishid, Field.Store.YES, Field.Index.NO);
                    doc.add(publishidField);
                    Field subjectidField = new Field("subjectid", webInfo.subjectid, Field.Store.YES, Field.Index.NO);
                    doc.add(subjectidField);
                    Field titleField = new Field("title", webInfo.title, Field.Store.YES, Field.Index.NO);
                    doc.add(titleField);
                    doc.add(new Field("keywords", webInfo.keywords, Field.Store.YES, Field.Index.NO));
                    doc.add(new Field("description", webInfo.description, Field.Store.YES, Field.Index.NO));
                    doc.add(new Field("content", webInfo.content, Field.Store.YES, Field.Index.ANALYZED));

                    // Add the last modified date of the file as a field named "modified".
                    // Use a NumericField that is indexed (i.e. efficiently filterable with
                    // NumericRangeFilter). This indexes to milli-second resolution, which
                    // is often too fine. You could instead create a number based on
                    // year/month/day/hour/minutes/seconds, down to the resolution you require.
                    // For example the long value 2011021714 would mean
                    // February 17, 2011, 2-3 PM.
                    // NumericField modifiedField = new NumericField("modified");
                    // modifiedField.setLongValue(file.lastModified());
                    // doc.add(modifiedField);

                    // Add the contents of the file to a field named "contents". Specify a Reader,
                    // so that the text of the file is tokenized and indexed, but not stored.
                    // Note that FileReader expects the file to be in UTF-8 encoding.
                    // If that's not the case searching for special characters will fail.
                    // doc.add(new Field("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                    if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                        // New index, so we just add the document (no old document can be there):
                        // System.out.println("adding " + file);
                        writer.addDocument(doc);
                    } else {
                        // Existing index (an old copy of this document may have been indexed) so
                        // we use updateDocument instead to replace the old one matching the exact
                        // path, if present:
                        // System.out.println("updating " + file);
                        writer.updateDocument(new Term("url", webInfo.url), doc);
                    }
                } // end while
            } finally {
                System.out.println("adding " + file);
                fis.close();
            }
        }
    }
}
From source file:com.flaptor.hounder.searcher.spell.SpellChecker.java
License:Apache License
/**
 * Index a Dictionary
 * @param dict the dictionary to index
 * @throws IOException
 */
public void indexDictionary(Dictionary dict) throws IOException {
    IndexWriter.unlock(spellindex);
    IndexWriter writer = new IndexWriter(spellindex, new WhitespaceAnalyzer(),
            !IndexReader.indexExists(spellindex), IndexWriter.MaxFieldLength.UNLIMITED);
    writer.setMergeFactor(300);
    writer.setMaxBufferedDocs(150);

    for (Pair<String, Float> pair : dict) {
        String word = pair.first();
        float boost = pair.last();
        int len = word.length();
        if (len < 3) {
            continue; // too short we bail but "too long" is fine...
        }
        if (this.exist(word)) {
            // if the word already exists in the gramindex
            continue;
        }
        // ok, index the word
        Document doc = createDocument(word, boost, getMin(len), getMax(len));
        writer.addDocument(doc);
    }
    // close writer
    writer.optimize();
    writer.close();
    // close reader
    reader.close();
    reader = null;
}
From source file:com.flycode.CRIBSearch.SearchEngine.Demo.IndexFiles.java
License:Apache License
/**
 * Indexes a single document
 */
private static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    try (InputStream stream = Files.newInputStream(file)) {
        // make a new, empty document
        Document doc = new Document();

        // Add the path of the file as a field named "path". Use a
        // field that is indexed (i.e. searchable), but don't tokenize
        // the field into separate words and don't index term frequency
        // or positional information:
        Field pathField = new StringField("path", file.toString(), Field.Store.YES);
        doc.add(pathField);

        // Add the last modified date of the file as a field named "modified".
        // Use a LongPoint that is indexed (i.e. efficiently filterable with
        // PointRangeQuery). This indexes to milli-second resolution, which
        // is often too fine. You could instead create a number based on
        // year/month/day/hour/minutes/seconds, down to the resolution you require.
        // For example the long value 2011021714 would mean
        // February 17, 2011, 2-3 PM.
        doc.add(new LongPoint("modified", lastModified));

        // Add the contents of the file to a field named "contents". Specify a Reader,
        // so that the text of the file is tokenized and indexed, but not stored.
        // Note that FileReader expects the file to be in UTF-8 encoding.
        // If that's not the case searching for special characters will fail.
        doc.add(new TextField("contents",
                new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))));

        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // New index, so we just add the document (no old document can be there):
            System.out.println("adding " + file);
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been indexed) so
            // we use updateDocument instead to replace the old one matching the exact
            // path, if present:
            System.out.println("updating " + file);
            writer.updateDocument(new Term("path", file.toString()), doc);
        }
    }
}
From source file:com.FormBasedXmlQueryDemo.java
License:Apache License
private void openExampleIndex() throws IOException {
    // Create a RAM-based index from our test data file
    RAMDirectory rd = new RAMDirectory();
    IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_40, analyzer);
    IndexWriter writer = new IndexWriter(rd, iwConfig);
    InputStream dataIn = getServletContext().getResourceAsStream("/WEB-INF/data.tsv");
    BufferedReader br = new BufferedReader(new InputStreamReader(dataIn, IOUtils.CHARSET_UTF_8));
    String line = br.readLine();
    final FieldType textNoNorms = new FieldType(TextField.TYPE_STORED);
    textNoNorms.setOmitNorms(true);
    while (line != null) {
        line = line.trim();
        if (line.length() > 0) {
            // parse row and create a document
            StringTokenizer st = new StringTokenizer(line, "\t");
            Document doc = new Document();
            doc.add(new Field("location", st.nextToken(), textNoNorms));
            doc.add(new Field("salary", st.nextToken(), textNoNorms));
            doc.add(new Field("type", st.nextToken(), textNoNorms));
            doc.add(new Field("description", st.nextToken(), textNoNorms));
            writer.addDocument(doc);
        }
        line = br.readLine();
    }
    writer.close();

    // open searcher
    // this example never closes its reader!
    IndexReader reader = DirectoryReader.open(rd);
    searcher = new IndexSearcher(reader);
}