List of usage examples for org.apache.lucene.index IndexWriter updateDocument
public long updateDocument(Term term, Iterable<? extends IndexableField> doc) throws IOException
Updates a document by first deleting the document(s) containing term and then adding the new document; the delete-then-add is atomic as seen by a reader on the same index. (All examples below call this public overload; the private updateDocument(DocumentsWriterDeleteQueue.Node<?>, Iterable) overload is internal IndexWriter plumbing.)
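Before the collected examples, a minimal, self-contained sketch of the canonical add-or-update pattern (Lucene 5.x-era API; the index path, field names, and analyzer are illustrative assumptions, not taken from any example below):

import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class UpdateDocumentSketch {
    public static void main(String[] args) throws Exception {
        try (Directory dir = FSDirectory.open(Paths.get("example-index"));
             IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            Document doc = new Document();
            // StringField is indexed but not tokenized, so "id" works as an exact-match key.
            doc.add(new StringField("id", "42", Field.Store.YES));
            doc.add(new TextField("body", "updated text", Field.Store.NO));
            // Deletes any document(s) whose "id" term equals "42", then adds doc.
            writer.updateDocument(new Term("id", "42"), doc);
            writer.commit();
        }
    }
}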
From source file:cs571.proj1.IndexFiles.java
License:Apache License
/** Indexes a single document */
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    if (checkForIgnoredFile(file.getFileName().toString()))
        return;
    try (InputStream stream = Files.newInputStream(file)) {
        // The input is expected to be UTF-8; if it is not, searching for
        // special characters will fail.
        BufferedReader br = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));
        String line, docNO = null, docID = null, s;
        StringBuilder sb = new StringBuilder();
        Field docIDField = null;
        Document doc = new Document();
        boolean docFound = false;
        while ((line = br.readLine()) != null) {
            line = line.trim();
            if (line.startsWith(docNO_start)) {
                docNO = removeTags(line);
                continue;
            }
            if (line.startsWith(docID_start)) {
                docID = removeTags(line);
                continue;
            }
            if (line.equals(docStart)) {
                docFound = true;
                continue;
            }
            if (line.equals(docEnd)) {
                docFound = false;
                // Prefer <DOCNO> as the document key; fall back to <DOCID>.
                if (docNO != null) {
                    docIDField = new StringField("docID", docNO, Field.Store.YES);
                    doc.add(docIDField);
                } else if (docID != null) {
                    docIDField = new StringField("docID", docID, Field.Store.YES);
                    doc.add(docIDField);
                } else {
                    continue;
                }
                if (tfidf || bm25) {
                    // Store term vectors (with positions and offsets) so the
                    // scoring code can re-read per-document term statistics.
                    FieldType tv = new FieldType();
                    tv.setTokenized(true);
                    tv.setStoreTermVectors(true);
                    tv.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
                    s = sb.toString();
                    byte[] bytes = s.getBytes(StandardCharsets.UTF_8); // explicit charset; the original used the platform default
                    ByteArrayInputStream bstream = new ByteArrayInputStream(bytes);
                    InputStreamReader isr = new InputStreamReader(bstream, StandardCharsets.UTF_8);
                    doc.add(new Field("contents", isr, tv));
                } else {
                    // Tokenized and indexed, but not stored.
                    doc.add(new TextField("contents", sb.toString(), Field.Store.NO));
                }
                // "path" is indexed but not tokenized, so it can serve as the
                // exact-match key for updateDocument below.
                Field pathField = new StringField("path", file.toString(), Field.Store.YES);
                doc.add(pathField);
                // LongField indexes to millisecond resolution, which is often
                // finer than needed; a coarser value such as 2011021714
                // (February 17, 2011, 2-3 PM) may suffice.
                doc.add(new LongField("modified", lastModified, Field.Store.NO));
                numOfDocuments++;
                sb.setLength(0);
                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been
                    // indexed), so we use updateDocument instead to replace the old
                    // one matching the exact path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.toString()), doc);
                }
                doc = new Document();
                docNO = null; // reset per-document identifiers so a later document
                docID = null; // cannot inherit a stale DOCNO (the original never reset these)
                continue;
            }
            if (docFound) {
                sb.append(line).append("\n");
            }
        }
    }
}
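A caveat on this example: every document extracted from a file shares the same "path" value, and updateDocument first deletes every document matching the supplied term. Re-indexing a multi-document file in APPEND mode therefore deletes each previously added document from that file in turn, leaving only the file's last document in the index. Keying the update by the per-document identifier would avoid this; a one-line sketch (not in the original source):

// Hypothetical fix: key the update by the per-document "docID" field rather
// than the shared "path", so each update replaces only its own document.
writer.updateDocument(new Term("docID", docNO != null ? docNO : docID), doc);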
From source file:de.berlios.jhelpdesk.utils.LuceneIndexer.java
License:Open Source License
public synchronized void updateIndexedArticle(Article article) {
    IndexWriter indexWriter = null;
    try {
        Document document = articleToDocument(article);
        indexWriter = getIndexWriter();
        // Replace any previously indexed version of this article, keyed by its id.
        indexWriter.updateDocument(new Term("id", String.valueOf(article.getArticleId())), document);
        indexWriter.commit();
    } catch (Exception ex) {
        log.error(ex.getMessage(), ex);
        throw new RuntimeException(ex);
    } finally {
        closeWriter(indexWriter);
    }
}
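The getIndexWriter() and closeWriter() helpers are not shown in this snippet. A plausible minimal version, written against the Lucene 5.x-style API for illustration (the index path and analyzer are assumptions, not taken from jHelpdesk):

// Hypothetical helpers assumed by the snippet above; not from the original source.
private IndexWriter getIndexWriter() throws IOException {
    Directory dir = FSDirectory.open(Paths.get("helpdesk-index")); // assumed location
    IndexWriterConfig conf = new IndexWriterConfig(new StandardAnalyzer());
    conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    return new IndexWriter(dir, conf);
}

private void closeWriter(IndexWriter writer) {
    if (writer == null)
        return;
    try {
        writer.close();
    } catch (IOException ex) {
        log.error("failed to close IndexWriter", ex);
    }
}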
From source file:de.hsmannheim.ss15.alr.searchengine.DefaultLuceneController.java
/** Indexes a single document */
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    try (InputStream stream = Files.newInputStream(file)) {
        // make a new, empty document
        org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();

        // Add the path of the file as a field named "path". Use a field that is
        // indexed (i.e. searchable), but don't tokenize the field into separate
        // words and don't index term frequency or positional information:
        Field pathField = new StringField("path", file.toString(), Field.Store.YES);
        doc.add(pathField);

        // Add the last modified date of the file as a field named "modified".
        // Use a LongField that is indexed (i.e. efficiently filterable with
        // NumericRangeFilter). This indexes to millisecond resolution, which is
        // often too fine. You could instead create a number based on
        // year/month/day/hour/minutes/seconds, down to the resolution you
        // require. For example the long value 2011021714 would mean
        // February 17, 2011, 2-3 PM.
        doc.add(new LongField("modified", lastModified, Field.Store.NO));

        // The file is expected to start with "URL:", "DataType:" and "Title:"
        // header lines, followed by the body text. Note that the reader expects
        // UTF-8; if that's not the case, searching for special characters will fail.
        BufferedReader reader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));
        List<String> lines = new ArrayList<>();
        String l;
        while ((l = reader.readLine()) != null) { // readLine() == null is the reliable EOF test (the original looped on reader.ready())
            lines.add(l);
        }
        if (lines.size() > 0) {
            String urlLine = lines.remove(0);
            if (urlLine != null && urlLine.startsWith("URL:")) {
                urlLine = urlLine.substring(4);
                doc.add(new TextField("URL", urlLine, Field.Store.YES));
            }
        }
        if (lines.size() > 0) {
            String dataType = lines.remove(0);
            if (dataType != null && dataType.startsWith("DataType:")) {
                dataType = dataType.substring(9);
                doc.add(new TextField("DataType", dataType, Field.Store.YES));
            }
        }
        if (lines.size() > 0) {
            String title = lines.remove(0);
            if (title != null && title.startsWith("Title:")) {
                title = title.substring(6);
                doc.add(new TextField("title", title, Field.Store.YES));
            }
        }
        StringBuilder content = new StringBuilder();
        for (String s : lines) {
            content.append(s);
        }
        doc.add(new TextField("contents", content.toString(), Field.Store.NO));

        if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) {
            // New index, so we just add the document (no old document can be there):
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been indexed),
            // so we use updateDocument instead to replace the old one matching the
            // exact path, if present:
            writer.updateDocument(new Term("path", file.toString()), doc);
        }
    }
}
From source file:de.jetsli.lumeo.util.LuceneHelperTest.java
License:Apache License
@Test
public void testTermMatching() throws Exception {
    RAMDirectory dir = new RAMDirectory();
    IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(RawLucene.VERSION, new KeywordAnalyzer()));
    Document d = new Document();
    FieldType ft = Mapping.getLongFieldType(true, true);
    d.add(new LongField("id", 1234, ft));
    d.add(new LongField("tmp", 1111, ft));
    w.addDocument(d);

    // Replace the document whose id is 1234; only its "tmp" value changes.
    d = new Document();
    d.add(new LongField("id", 1234, ft));
    d.add(new LongField("tmp", 2222, ft));
    w.updateDocument(getTerm("id", 1234), d);

    d = new Document();
    d.add(new LongField("id", 0, ft));
    w.addDocument(d);
    w.commit();

    IndexReader reader = DirectoryReader.open(w, true);
    IndexSearcher searcher = new IndexSearcher(reader);
    // Numeric fields are indexed as prefix-coded terms, so the query term
    // must be encoded the same way.
    BytesRef bytes = new BytesRef();
    NumericUtils.longToPrefixCoded(1234, 0, bytes);
    TopDocs td = searcher.search(new TermQuery(new Term("id", bytes)), 10);
    assertEquals(1, td.totalHits);
    assertEquals(1234L, searcher.doc(td.scoreDocs[0].doc).getField("id").numericValue());
    assertEquals(2222L, searcher.doc(td.scoreDocs[0].doc).getField("tmp").numericValue());
    w.close();
}
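The getTerm(...) helper isn't shown here, but for updateDocument to match anything it must produce the same prefix-coded bytes that LongField wrote to the index, just as the query at the end of the test does. A sketch of what it presumably looks like (an assumption, not the project's actual code):

// Hypothetical sketch of getTerm: build a Term whose bytes use the same
// prefix coding that the numeric field was indexed with.
static Term getTerm(String field, long value) {
    BytesRef bytes = new BytesRef();
    NumericUtils.longToPrefixCoded(value, 0, bytes);
    return new Term(field, bytes);
}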
From source file:edu.albany.ir.example.IndexFiles.java
License:Apache License
/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 *
 * NOTE: This method indexes one Lucene document per <DOCNO> entry found in
 * each input file, rather than one document per file.
 *
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file   The file to index, or the directory to recurse into to find files to index
 * @throws IOException If there is a low-level I/O error
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (!file.canRead())
        return;
    if (file.isDirectory()) {
        String[] files = file.list(); // an IO error could occur
        if (files != null) {
            for (int i = 0; i < files.length; i++) {
                indexDocs(writer, new File(file, files[i]));
            }
        }
        return;
    }
    FileInputStream fis;
    try {
        fis = new FileInputStream(file);
    } catch (FileNotFoundException fnfe) {
        // At least on Windows, some temporary files raise this exception with an
        // "access denied" message; checking whether the file can be read doesn't help.
        fnfe.printStackTrace();
        return;
    }
    try {
        String record;
        int a, b, stringNum = 0;
        String docName = null;
        Document doc = new Document();
        BufferedReader reader = new BufferedReader(new InputStreamReader(fis));
        while ((record = reader.readLine()) != null) {
            a = record.lastIndexOf("<DOCNO>");
            b = record.indexOf("</DOCNO>");
            if (a >= 0 && b > 0) { // this line contains the DOCNO
                stringNum++;
                docName = record.substring(a + 7, b).trim();
                if (stringNum >= 2) {
                    // flush the previous document before starting a new one
                    writer.addDocument(doc);
                }
                doc = new Document();
                // "path" holds the DOCNO: indexed, not tokenized, with no term
                // frequencies or positions.
                Field pathField = new Field("path", docName, Field.Store.YES,
                        Field.Index.NOT_ANALYZED_NO_NORMS);
                pathField.setOmitTermFreqAndPositions(true);
                doc.add(pathField);
                System.out.println("adding " + docName);
                // "modified" uses a NumericField (efficiently filterable with
                // NumericRangeFilter), indexed at millisecond resolution.
                NumericField modifiedField = new NumericField("modified");
                modifiedField.setLongValue(file.lastModified());
                doc.add(modifiedField);
            } else {
                // Every non-DOCNO line goes into "contents": tokenized, stored,
                // with term vectors.
                doc.add(new Field("contents", record, Field.Store.YES,
                        Field.Index.ANALYZED, Field.TermVector.YES));
            }
        }
        // Add or update the last document read from this file. Note that the
        // update term uses file.getPath() although documents are keyed by
        // DOCNO; this mismatch is kept exactly as in the original source.
        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // New index, so we just add the document (no old document can be there):
            System.out.println("adding " + docName);
            writer.addDocument(doc);
        } else {
            // Existing index: replace the old document matching the exact path, if present.
            System.out.println("updating " + file);
            writer.updateDocument(new Term("path", file.getPath()), doc);
        }
    } finally {
        fis.close();
    }
}
From source file:edu.cmu.cs.in.search.HoopLuceneIndex.java
License:Apache License
/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 *
 * NOTE: This method indexes one document per input file. This is slow. For good
 * throughput, put multiple documents into your input file(s). An example of this
 * is in the benchmark module, which can create "line doc" files, one document
 * per line, using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file   The file to index, or the directory to recurse into to find files to index
 * @throws IOException If there is a low-level I/O error
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list(); // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on Windows, some temporary files raise this exception
                // with an "access denied" message; checking if the file can be
                // read doesn't help
                return;
            }
            try {
                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path". Use a field
                // that is indexed (i.e. searchable), but don't tokenize the field
                // into separate words and don't index term frequency or
                // positional information:
                Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);

                // Add the last modified date of the file as a field named
                // "modified". Use a LongField that is indexed (i.e. efficiently
                // filterable with NumericRangeFilter). This indexes to millisecond
                // resolution, which is often too fine. You could instead create a
                // number based on year/month/day/hour/minutes/seconds, down to the
                // resolution you require. For example the long value 2011021714
                // would mean February 17, 2011, 2-3 PM.
                doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

                // Add the contents of the file to a field named "contents".
                // Specify a Reader, so that the text of the file is tokenized and
                // indexed, but not stored. Note that the reader expects the file
                // to be in UTF-8 encoding; if that's not the case, searching for
                // special characters will fail.
                doc.add(new TextField("contents",
                        new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been
                    // indexed), so we use updateDocument instead to replace the
                    // old one matching the exact path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }
            } finally {
                fis.close();
            }
        }
    }
}
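The OpenMode check in the add-or-update branch only behaves as described because of how the demo configures its writer. A minimal configuration sketch (Lucene 4.x-era API; the create flag mirrors the Lucene demo's "-update" command-line option and is an assumption here):

// Sketch of the writer setup this indexDocs assumes; not part of the snippet above.
Directory dir = FSDirectory.open(new File("index"));
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, analyzer);
iwc.setOpenMode(create ? OpenMode.CREATE : OpenMode.CREATE_OR_APPEND);
IndexWriter writer = new IndexWriter(dir, iwc);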
From source file:edu.cmu.lti.huiying.ir.rangedsearch.TableIndexer.java
License:Apache License
public void indexExplodedXml(IndexWriter writer, File file) throws IOException {
    if (!file.canRead())
        return;
    if (file.isDirectory()) {
        String[] files = file.list();
        if (files != null) {
            for (int i = 0; i < files.length; i++) {
                indexExplodedXml(writer, new File(file, files[i]));
            }
        }
        return;
    }
    FileInputStream fis = new FileInputStream(file);
    try {
        NumericFeatureGenerator nfg = new NumericFeatureGenerator();
        if (this.xmlreader == null) {
            this.xmlreader = new XmlStAXReader();
        }
        Article a = xmlreader.readArticleFromXml(file.getAbsolutePath());
        for (Table t : a.tables) {
            for (Group g : t.groups) {
                for (Column col : g.columns) {
                    // Index one document per column, carrying numeric summary
                    // features of the column's cells.
                    Document coldoc = new Document();
                    ArrayList<Double> cfv = nfg.getFeatureVector(col.content);
                    if (cfv.get(0) != null)
                        coldoc.add(new DoubleField("intratio", cfv.get(0), Field.Store.NO));
                    if (cfv.get(1) != null)
                        coldoc.add(new DoubleField("floatratio", cfv.get(1), Field.Store.NO));
                    if (cfv.get(3) != null)
                        coldoc.add(new DoubleField("mean", cfv.get(3), Field.Store.NO));
                    if (cfv.get(4) != null)
                        coldoc.add(new DoubleField("std", cfv.get(4), Field.Store.NO));
                    if (cfv.get(6) != null)
                        coldoc.add(new DoubleField("min", cfv.get(6), Field.Store.NO));
                    if (cfv.get(7) != null)
                        coldoc.add(new DoubleField("max", cfv.get(7), Field.Store.NO));
                    if (cfv.get(8) != null)
                        coldoc.add(new DoubleField("acc", cfv.get(8), Field.Store.NO));
                    if (cfv.get(11) != null)
                        coldoc.add(new DoubleField("colmag", cfv.get(11), Field.Store.NO));

                    StringField wholegroup = new StringField("wholegroup", g.toString(), Field.Store.YES);
                    int groupBytes = wholegroup.stringValue().getBytes().length;
                    if (groupBytes > 32760) {
                        // Stored fields are limited to 32766 bytes per term;
                        // report the original size, then truncate. (The original
                        // printed the length after truncating, which always
                        // showed the placeholder's size.)
                        System.err.println("table too large:" + groupBytes);
                        wholegroup.setStringValue("Table too large...");
                    }
                    String headers = "";
                    if (col.headers != null) {
                        for (Header hdr : col.headers) {
                            headers += hdr.text.toLowerCase() + " ";
                        }
                    }
                    coldoc.add(new TextField("headerkeywords", headers.trim(), Field.Store.NO));
                    coldoc.add(wholegroup);
                    coldoc.add(new StringField("filename", file.getAbsolutePath(), Field.Store.YES));
                    coldoc.add(new StringField("type", "column", Field.Store.YES));
                    IntField bstart = new IntField("bytestart", col.content.get(0).byteStart, Field.Store.YES);
                    IntField bend = new IntField("byteend",
                            col.content.get(col.content.size() - 1).byteEnd, Field.Store.YES);
                    String content = "";
                    for (edu.cmu.lti.huiying.domainclasses.Field f : col.content)
                        content += f.text + "|";
                    coldoc.add(new StringField("colcontent",
                            content.substring(0, content.length() - 1), Field.Store.YES));
                    coldoc.add(bstart);
                    coldoc.add(bend);
                    if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                        writer.addDocument(coldoc);
                        totalDocAdded++;
                    } else {
                        writer.updateDocument(new Term("path", file.getPath()), coldoc);
                    }

                    for (edu.cmu.lti.huiying.domainclasses.Field f : col.content) {
                        // Index one document per numeric cell.
                        Document celldoc = new Document();
                        ArrayList<Double> fv = nfg.field2Features(f);
                        if (fv.get(0) == 1 || fv.get(0) == 2) {
                            try {
                                celldoc.add(new DoubleField("value", fv.get(1), Field.Store.YES));
                                celldoc.add(new StringField("text", f.text, Field.Store.YES));
                                // The original tested "fv.get(2) != Double.NaN", which is
                                // always true; Double.isNaN() is the correct check, and the
                                // null test must short-circuit (&&, not &).
                                if (fv.get(2) != null && !Double.isNaN(fv.get(2))) {
                                    celldoc.add(new DoubleField("error", fv.get(2), Field.Store.NO));
                                }
                                if (fv.get(5) != null && !Double.isNaN(fv.get(5))) {
                                    celldoc.add(new DoubleField("cellmag", fv.get(5), Field.Store.NO));
                                }
                                if (fv.get(4) != null) {
                                    celldoc.add(new DoubleField("cellpvalue", fv.get(4), Field.Store.NO));
                                }
                                celldoc.add(new StringField("filename", file.getAbsolutePath(), Field.Store.YES));
                                celldoc.add(new StringField("type", "cell", Field.Store.YES));
                                celldoc.add(new IntField("bytestart", f.byteStart, Field.Store.YES));
                                celldoc.add(new IntField("byteend", f.byteEnd, Field.Store.YES));
                            } catch (NullPointerException e) {
                                e.printStackTrace();
                                System.out.println(f.text);
                            }
                            if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                                writer.addDocument(celldoc);
                                totalDocAdded++;
                            } else {
                                writer.updateDocument(new Term("path", file.getPath()), celldoc);
                            }
                        }
                    }
                }
            }
        }
    } finally {
        fis.close();
    }
}
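Worth noting: neither coldoc nor celldoc ever gets a "path" field, so in APPEND mode the updateDocument calls above match no existing term and effectively behave like addDocument. If replacement on re-indexing is intended, the field the update term is keyed on must itself be indexed; a one-line sketch (hypothetical, not in the original):

// Hypothetical: index the field that updateDocument's term is keyed on.
coldoc.add(new StringField("path", file.getPath(), Field.Store.NO));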
From source file:edu.cmu.lti.huiying.ir.rangedsearch.TableIndexer.java
License:Apache License
public void indexOffsetAnnotation(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (!file.canRead())
        return;
    if (file.isDirectory()) {
        String[] files = file.list(); // an IO error could occur
        if (files != null) {
            for (int i = 0; i < files.length; i++) {
                if (files[i].equals("NeuroScience.num.offset"))
                    indexOffsetAnnotation(writer, new File(file, files[i]));
            }
        }
        return;
    }
    FileInputStream fis;
    try {
        fis = new FileInputStream(file);
    } catch (FileNotFoundException fnfe) {
        return;
    }
    try {
        // make a new, empty document; blank lines in the offset annotation
        // file separate one record from the next
        Document doc = new Document();
        BufferedReader br = new BufferedReader(new InputStreamReader(fis, StandardCharsets.UTF_8));
        String line = null;
        String filename = null;
        while ((line = br.readLine()) != null) {
            if (line.trim().length() == 0) {
                doc.add(new StringField("filename", filename, Field.Store.YES));
                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    writer.addDocument(doc);
                } else {
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }
                doc = new Document();
                filename = null;
                continue;
            }
            String[] spl = line.split("\t");
            doc.add(new DoubleField(spl[3], Double.parseDouble(spl[5]), Field.Store.YES));
            if (filename == null)
                filename = spl[0];
        }
        br.close();
    } finally {
        fis.close();
    }
}
From source file:edu.harvard.iq.dvn.core.index.Indexer.java
License:Apache License
protected void updateDocument(Document doc, long studyId) throws IOException {
    try {
        IndexWriter writer = new IndexWriter(dir, getAnalyzer(), isIndexEmpty(),
                IndexWriter.MaxFieldLength.UNLIMITED);
        writer.updateDocument(new Term("id", Long.toString(studyId)), doc);
        // TODO: Figure out, eventually, what to do with the variable and file
        // metadata searches here. -- L.A.
        //
        // Our deleteDocument() method contains these two lines, below, in
        // addition to deleting the document by the "id" term, as above:
        //   reader.deleteDocuments(new Term("varStudyId", Long.toString(studyId)));
        //   reader.deleteDocuments(new Term("versionStudyId", Long.toString(studyId)));
        writer.commit();
        writer.close();
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}
From source file:edu.harvard.iq.dvn.core.index.Indexer.java
License:Apache License
protected void updateStudyDocument(long studyId, String field, String value) throws IOException {
    IndexReader reader = IndexReader.open(dir, false);
    try {
        if (reader != null) {
            TermDocs matchingDocuments = reader.termDocs();
            if (matchingDocuments != null) {
                int c = 1;
                if (matchingDocuments.next()) {
                    // We only expect 1 document when searching by study id.
                    Document studyDocument = reader.document(matchingDocuments.doc());
                    logger.fine("processing matching document number " + c++);
                    if (studyDocument != null) {
                        logger.fine("got a non-zero doc;");
                        reader.close();
                        reader = null;
                        logger.fine("deleted the document;");
                        IndexWriter localWriter = new IndexWriter(dir, getAnalyzer(), isIndexEmpty(),
                                IndexWriter.MaxFieldLength.UNLIMITED);
                        localWriter.updateDocument(new Term("id", Long.toString(studyId)), studyDocument);
                        localWriter.commit();
                        localWriter.close();
                        logger.fine("wrote the updated version of the document;");
                    }
                }
            }
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    } finally {
        if (reader != null) {
            reader.close();
        }
    }
}
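Both Indexer methods above open a brand-new IndexWriter (via the legacy Lucene 3.x constructor with MaxFieldLength) for every single update, paying the full writer startup cost each time. A sketch of the same update against one long-lived writer (the sharedWriter field is an assumption, not in the original class):

// Hypothetical variant that reuses a single writer rather than
// opening, committing, and closing one per update.
protected void updateDocument(Document doc, long studyId) throws IOException {
    sharedWriter.updateDocument(new Term("id", Long.toString(studyId)), doc);
    sharedWriter.commit(); // or batch commits for better indexing throughput
}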