List of usage examples for org.apache.lucene.index IndexWriter updateDocument
private long updateDocument(final DocumentsWriterDeleteQueue.Node<?> delNode, Iterable<? extends IndexableField> doc) throws IOException
From source file:edu.rpi.tw.linkipedia.search.indexing.EntityIndexUpdater.java
License:Open Source License
private void indexDocs(IndexWriter writer, File file) { if (file.canRead()) { if (file.isDirectory()) { String[] files = file.list(); if (files != null) { for (int i = 0; i < files.length; i++) { System.out.print(i + " "); indexDocs(writer, new File(file, files[i])); }//from w w w .ja va 2s . c o m } } else { System.out.println("adding " + file); try { FileInputStream fstream = new FileInputStream(file); DataInputStream in = new DataInputStream(fstream); BufferedReader br = new BufferedReader(new InputStreamReader(in)); String line; // Set<String> labels = new HashSet<String>(); String subject = ""; HashMap<String, Object> data = new HashMap<String, Object>(); ArrayList<String> triples = new ArrayList<String>(); String contents = ""; int count = 0; while ((line = br.readLine()) != null) { String[] spo = line.split(" ", 3); if (spo.length < 3) { continue; } if (!(spo[0].startsWith("<http") || spo[0].startsWith("_:"))) continue; if (!(spo[1].startsWith("<http") || spo[1].startsWith("_:"))) continue; count++; if (!subject.equals(spo[0])) { if (!subject.equals("")) { data.put("url", subject); data.put("triples", triples); data.put("contents", contents); System.out.println(count + " adding " + subject); Document doc = getDoc(subject, data); writer.updateDocument(new Term("url", subject), doc); } subject = spo[0]; triples = new ArrayList<String>(); contents = ""; } triples.add(spo[1] + " " + spo[2]); //spo[2] = spo[2].toLowerCase(); if (!(spo[2].startsWith("<") || spo[2].startsWith("_"))) { contents += spo[2]; } } //last entity if (!subject.equals("")) { data.put("url", subject); data.put("triples", triples); data.put("contents", contents); System.out.println(count + " adding " + subject); Document doc = getDoc(subject, data); writer.updateDocument(new Term("url", subject), doc); } } catch (Exception e) { e.printStackTrace(); } } } }
From source file:edu.rpi.tw.linkipedia.search.indexing.SurfaceFormIndexUpdater.java
License:Open Source License
private void indexDocs(IndexWriter writer, File file) { if (file.canRead()) { if (file.isDirectory()) { String[] files = file.list(); if (files != null) { for (int i = 0; i < files.length; i++) { System.out.print(i + " "); indexDocs(writer, new File(file, files[i])); }//from w w w.j a v a 2s .c om } } else { System.out.println("adding " + file); try { FileInputStream fstream = new FileInputStream(file); DataInputStream in = new DataInputStream(fstream); BufferedReader br = new BufferedReader(new InputStreamReader(in)); String line; Set<String> labels = new HashSet<String>(); HashSet<String> defaultLabel = new HashSet<String>(); Set<String> lookUpLabels = new HashSet<String>(); String subject = ""; HashMap<String, Object> data = new HashMap<String, Object>(); int count = 0; while ((line = br.readLine()) != null) { String[] spo = line.split(" ", 3); if (spo.length < 3) { continue; } if (!(spo[0].startsWith("<http") || spo[0].startsWith("_:"))) continue; if (!(spo[1].startsWith("<http") || spo[1].startsWith("_:"))) continue; count++; if (!subject.equals(spo[0])) { if (!subject.equals("")) { data.put("url", subject); data.put("label", labels); data.put("defaultLabel", defaultLabel); data.put("lookUpLabel", lookUpLabels); System.out.println(count + " adding " + subject); Document doc = getDoc(data); writer.updateDocument(new Term("url", subject), doc); } subject = spo[0]; defaultLabel = new HashSet<String>(); labels = new HashSet<String>(); lookUpLabels = new HashSet<String>(); } if (spo[2].equals("")) { continue; } String property = spo[1].toLowerCase(); if (!property.contains("name") && !property.contains("label")) { continue; } /* if(spo[2].matches("u\\d+.*")){ continue; } */ spo[2] = spo[2].toLowerCase(); spo[2] = spo[2] + "|" + getPropertyWeight(spo[1]); if (spo[1].contains("urlName") || spo[1].contains("redirectName")) { defaultLabel.add(spo[2]); } labels.add(spo[2]); String removeSingles = Utils.removeSingleLetter(spo[2]); if (!removeSingles.equals(spo[0])) 
labels.add(removeSingles); addingLabels(lookUpLabels, spo[2]); //labels.add(reviseString(spo[2])); } //index last entity if (!subject.equals("")) { data.put("url", subject); data.put("label", labels); data.put("defaultLabel", defaultLabel); data.put("lookUpLabel", lookUpLabels); System.out.println(count + " adding " + subject); Document doc = getDoc(data); writer.updateDocument(new Term("url", subject), doc); } } catch (Exception e) { e.printStackTrace(); } } } }
From source file:edu.uci.ics.cs221wiki.IndexFiles.java
License:Apache License
/** * Indexes the given file using the given writer, or if a directory is given, * recurses over files and directories found under the given directory. * /*from w ww .ja v a 2 s .c o m*/ * NOTE: This method indexes one document per input file. This is slow. For good * throughput, put multiple documents into your input file(s). An example of this is * in the benchmark module, which can create "line doc" files, one document per line, * using the * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html" * >WriteLineDocTask</a>. * * @param writer Writer to the index where the given file/dir info will be stored * @param file The file to index, or the directory to recurse into to find files to index * @throws IOException */ static void indexDocs(IndexWriter writer, File file) throws IOException { // do not try to index files that cannot be read if (file.canRead()) { if (file.isDirectory()) { String[] files = file.list(); // an IO error could occur if (files != null) { for (int i = 0; i < files.length; i++) { indexDocs(writer, new File(file, files[i])); } } } else { FileInputStream fis; try { fis = new FileInputStream(file); } catch (FileNotFoundException fnfe) { // at least on windows, some temporary files raise this exception with an "access denied" message // checking if the file can be read doesn't help return; } try { // make a new, empty document Document doc = new Document(); // Add the path of the file as a field named "path". Use a // field that is indexed (i.e. searchable), but don't tokenize // the field into separate words and don't index term frequency // or positional information: Field pathField = new Field("path", file.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS); pathField.setIndexOptions(IndexOptions.DOCS_ONLY); doc.add(pathField); // Add the last modified date of the file a field named "modified". // Use a NumericField that is indexed (i.e. 
efficiently filterable with // NumericRangeFilter). This indexes to milli-second resolution, which // is often too fine. You could instead create a number based on // year/month/day/hour/minutes/seconds, down the resolution you require. // For example the long value 2011021714 would mean // February 17, 2011, 2-3 PM. NumericField modifiedField = new NumericField("modified"); modifiedField.setLongValue(file.lastModified()); doc.add(modifiedField); // Add the contents of the file to a field named "contents". Specify a Reader, // so that the text of the file is tokenized and indexed, but not stored. // Note that FileReader expects the file to be in UTF-8 encoding. // If that's not the case searching for special characters will fail. HTMLTextParser htp = new HTMLTextParser(); htp.HTMLtoTextParser(file.getPath()); String parsedFileName = file.getPath().substring(file.getPath().lastIndexOf("/") + 1, file.getPath().indexOf(".") - 1); parsedFileName = "./output/" + parsedFileName + ".txt"; //System.out.println(file.getPath()); //System.out.println(parsedFileName); FileInputStream parsedFis = new FileInputStream(parsedFileName); doc.add(new Field("contents", new BufferedReader(new InputStreamReader(parsedFis, "UTF-8")))); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): System.out.println("adding " + file); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: System.out.println("updating " + file); writer.updateDocument(new Term("path", file.getPath()), doc); } } finally { fis.close(); } } } }
From source file:edu.uci.ics.searcher.IndexFiles.java
License:Apache License
/** * Add a url and its content to the index. * /*from w w w. j a v a2 s .c o m*/ * @param writer Writer to the index where the given file/dir info will be stored * @param url The url string * @param url_text_path Content file of the url */ static private void addDoc(IndexWriter writer, String url, String docsPath, String fileName) { Document doc = new Document(); try { // add url doc.add(new StringField("url", url, Field.Store.YES)); // add contents FileInputStream fis = new FileInputStream(docsPath + "Textdata/" + fileName); doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8")))); // add title String title = HtmlParser.getTitle(docsPath + "Htmldata/" + fileName); doc.add(new TextField("title", title, Field.Store.YES)); // add length File f = new File(docsPath + "Textdata/" + fileName); doc.add(new LongField("length", f.length(), Field.Store.YES)); // Document-level boost //doc.setBoost(1.0f); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): System.out.println("adding " + url); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: System.out.println("updating " + url); writer.updateDocument(new Term("url", url), doc); } } catch (Exception e) { System.err.println(e.getMessage()); } }
From source file:engine.easy.indexer.EasySearchIndexBuilder.java
License:Apache License
/**
 * Replaces the indexed documents whose DOCID matches each key of the map.
 *
 * Fixes vs. original:
 * - the stale-lock check ran AFTER the writer was opened, i.e. it would unlock
 *   the directory while THIS writer held the lock — the check now runs before
 *   opening (the original placement was dead-or-dangerous code);
 * - the writer was leaked if any exception fired before close() — it is now
 *   closed in a finally block.
 *
 * @param docsMap map from document id to the replacement Lucene document
 */
public static void updateDocuments(Map<Integer, Document> docsMap) {
    if (docsMap.isEmpty()) {
        return;
    }
    IndexWriter indexWriter = null;
    try {
        Directory indexDir = FSDirectory.open(new File(AppConstants.INDEX_DIR_PATH));
        // Clear a stale lock left by a crashed process BEFORE acquiring our own.
        if (IndexWriter.isLocked(indexDir)) {
            IndexWriter.unlock(indexDir);
        }
        // NOTE(review): create=Boolean.TRUE recreates the index before the
        // updates are applied; confirm this is intended for an "update" method.
        indexWriter = new IndexWriter(indexDir, new EasySearchAnalyzer(), Boolean.TRUE,
                MaxFieldLength.UNLIMITED);
        EasySearchIndexWriter esiWrtier = new EasySearchIndexWriter(indexWriter);
        for (Integer docId : docsMap.keySet()) {
            Document doc = docsMap.get(docId);
            // delete-then-add keyed on the stringified document id
            indexWriter.updateDocument(new Term("DOCID", docId.toString()), doc);
        }
        indexWriter.optimize();
        indexWriter.commit();
    } catch (Exception e) {
        System.out.println("Exception : " + e.toString());
    } finally {
        if (indexWriter != null) {
            try {
                indexWriter.close();
            } catch (Exception e) {
                System.out.println("Exception : " + e.toString());
            }
        }
    }
}
From source file:es.unizar.iaaa.crawler.butler.index.IndexFiles.java
License:Apache License
/** * Indexes the given file using the given writer, or if a directory is * given, recurses over files and directories found under the given * directory.// ww w .j a v a 2 s . c o m * * NOTE: This method indexes one document per input file. This is slow. For * good throughput, put multiple documents into your input file(s). An * example of this is in the benchmark module, which can create "line doc" * files, one document per line, using the <a href= * "../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html" * >WriteLineDocTask</a>. If there is a low-level I/O error */ public void indexDocs(IndexWriter writer, File file) throws IOException { // file ins salida.txt where the structure is (URI + parsetext)+ /* * Recno:: 0 URL:: http://www.unizar.es/ * * ParseText:: blablabla */ if (file.canRead()) { try (Scanner scan = new Scanner(new FileInputStream(file))) { // make a new, empty document // Add the path of the file as a field named "path". Use a // field that is indexed (i.e. searchable), but don't // tokenize // the field into separate words and don't index term // frequency // or positional information: // Add the last modified date of the file a field named // "modified". // Use a LongField that is indexed (i.e. efficiently // filterable with // NumericRangeFilter). This indexes to milli-second // resolution, which // is often too fine. You could instead create a number // based on // year/month/day/hour/minutes/seconds, down the resolution // you require. // For example the long value 2011021714 would mean // February 17, 2011, 2-3 PM. 
LOGGER.info("adding "); int i = 0; while (scan.hasNextLine()) { String line = scan.nextLine(); if (line.contains("Recno::")) { // fichero String url = scan.nextLine(); scan.nextLine(); scan.nextLine(); String content = scan.nextLine(); url = url.replace("URL:: ", ""); Document doc = new Document(); insertInIndex(url, "url", doc, "text"); insertInIndex(content, "content", doc, "text"); // ya se ha aacbado el fichero if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the // document (no old // document can be there): writer.addDocument(doc); } else { // Existing index (an old copy of this // document may have // been indexed) so // we use updateDocument instead to // replace the old one // matching the exact // path, if present: writer.updateDocument(new Term("path", file.getPath()), doc); } if (i % 100 == 0) LOGGER.info(i + " lines"); i++; } // siguiente linea } // Add the contents of the file to a field named "contents". // Specify a Reader, // so that the text of the file is tokenized and indexed, // but not stored. // Note that FileReader expects the file to be in UTF-8 // encoding. // If that's not the case searching for special characters // will fail LOGGER.info("added " + i); } } }
From source file:fr.paris.lutece.plugins.document.service.docsearch.DocSearchService.java
License:Open Source License
/** * Indexing documents for searching/*from w w w. j a va 2 s . c o m*/ * @param bCreate tell if it's total indexing or total (total = true) * @return indexing logs */ public String processIndexing(boolean bCreate) { StringBuilder sbLogs = new StringBuilder(); IndexWriter writer = null; boolean bCreateIndex = bCreate; try { sbLogs.append("\r\nIndexing all contents ...\r\n"); Directory dir = NIOFSDirectory.open(new File(_strIndex)); if (!DirectoryReader.indexExists(dir)) { //init index bCreateIndex = true; } Date start = new Date(); IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_46, _analyzer); if (bCreateIndex) { conf.setOpenMode(OpenMode.CREATE); } else { conf.setOpenMode(OpenMode.APPEND); } writer = new IndexWriter(dir, conf); if (!bCreateIndex) { //incremental indexing //add all document which must be add for (IndexerAction action : getAllIndexerActionByTask(IndexerAction.TASK_CREATE)) { try { ArrayList<Integer> luceneDocumentId = new ArrayList<Integer>(); luceneDocumentId.add(action.getIdDocument()); List<org.apache.lucene.document.Document> luceneDocument = _indexer .getDocuments(luceneDocumentId); if ((luceneDocument != null) && (luceneDocument.size() > 0)) { Iterator<org.apache.lucene.document.Document> it = luceneDocument.iterator(); while (it.hasNext()) { org.apache.lucene.document.Document doc = it.next(); writer.addDocument(doc); sbLogs.append("Adding "); sbLogs.append(doc.get(DocSearchItem.FIELD_TYPE)); sbLogs.append(" #"); sbLogs.append(doc.get(DocSearchItem.FIELD_UID)); sbLogs.append(" - "); sbLogs.append(doc.get(DocSearchItem.FIELD_TITLE)); sbLogs.append("\r\n"); } } } catch (IOException e) { sbLogs.append("Error durign document indexation parsing."); sbLogs.append("\r\n"); } removeIndexerAction(action.getIdAction()); } //Update all document which must be update for (IndexerAction action : getAllIndexerActionByTask(IndexerAction.TASK_MODIFY)) { try { ArrayList<Integer> luceneDocumentId = new ArrayList<Integer>(); 
luceneDocumentId.add(action.getIdDocument()); List<org.apache.lucene.document.Document> luceneDocument = _indexer .getDocuments(luceneDocumentId); if ((luceneDocument != null) && (luceneDocument.size() > 0)) { Iterator<org.apache.lucene.document.Document> it = luceneDocument.iterator(); while (it.hasNext()) { org.apache.lucene.document.Document doc = it.next(); writer.updateDocument( new Term(DocSearchItem.FIELD_UID, Integer.toString(action.getIdDocument())), doc); sbLogs.append("Updating "); sbLogs.append(doc.get(DocSearchItem.FIELD_TYPE)); sbLogs.append(" #"); sbLogs.append(doc.get(DocSearchItem.FIELD_UID)); sbLogs.append(" - "); sbLogs.append(doc.get(DocSearchItem.FIELD_TITLE)); sbLogs.append("\r\n"); } } } catch (IOException e) { sbLogs.append("Error durign document indexation parsing."); sbLogs.append("\r\n"); } removeIndexerAction(action.getIdAction()); } //delete all document which must be delete for (IndexerAction action : getAllIndexerActionByTask(IndexerAction.TASK_DELETE)) { writer.deleteDocuments( new Term(DocSearchItem.FIELD_UID, Integer.toString(action.getIdDocument()))); sbLogs.append("Deleting "); sbLogs.append(" #"); sbLogs.append(action.getIdDocument()); sbLogs.append("\r\n"); removeIndexerAction(action.getIdAction()); } } else { //delete all incremental action removeAllIndexerAction(); Collection<Integer> listIdDocuments = DocumentHome.findAllPrimaryKeys(); ArrayList<Integer> luceneDocumentId; for (Integer nIdDocument : listIdDocuments) { try { luceneDocumentId = new ArrayList<Integer>(); luceneDocumentId.add(nIdDocument); List<Document> listDocuments = _indexer.getDocuments(luceneDocumentId); for (Document doc : listDocuments) { writer.addDocument(doc); sbLogs.append("Indexing "); sbLogs.append(doc.get(DocSearchItem.FIELD_TYPE)); sbLogs.append(" #"); sbLogs.append(doc.get(DocSearchItem.FIELD_UID)); sbLogs.append(" - "); sbLogs.append(doc.get(DocSearchItem.FIELD_TITLE)); sbLogs.append("\r\n"); } } catch (IOException e) { sbLogs.append("Error 
durign document indexation parsing."); sbLogs.append("\r\n"); } } } Date end = new Date(); sbLogs.append("Duration of the treatment : "); sbLogs.append(end.getTime() - start.getTime()); sbLogs.append(" milliseconds\r\n"); } catch (Exception e) { sbLogs.append(" caught a "); sbLogs.append(e.getClass()); sbLogs.append("\n with message: "); sbLogs.append(e.getMessage()); sbLogs.append("\r\n"); AppLogService.error("Indexing error : " + e.getMessage(), e); } finally { try { if (writer != null) { writer.close(); } } catch (IOException e) { AppLogService.error(e.getMessage(), e); } } return sbLogs.toString(); }
From source file:gentest.LuceneTest.java
/** * Testing for the Lucene add tag bug./* w ww. j a v a2s. c o m*/ * * @throws IOException * @throws InterruptedException */ public static void t1() throws IOException, InterruptedException { FSDirectory dir = FSDirectory.open(FileSystems.getDefault().getPath("_lucene_test_")); //Write the document IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer())); Document doc = new Document(); doc.add(new TextField("content", "test\\test", Field.Store.YES)); writer.addDocument(doc); writer.close(); //Add the tag DirectoryReader reader = DirectoryReader.open(dir); writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer())); IndexSearcher search = new IndexSearcher(reader); Query q = new TermQuery(new Term("content", "test\\\\test")); TopDocs docs = search.search(q, 1); doc = reader.document(docs.scoreDocs[0].doc); doc.add(new TextField("tag", "tag test", Field.Store.YES)); writer.updateDocument(new Term("content", "test\\test"), doc); writer.close(); reader.close(); doc = null; //Check if the document can still be found reader = DirectoryReader.open(dir); search = new IndexSearcher(reader); TopScoreDocCollector collect = TopScoreDocCollector.create(1); q = new TermQuery(new Term("content", "test")); search.search(q, collect); doc = reader.document(collect.topDocs().scoreDocs[0].doc); for (IndexableField field : doc.getFields()) { System.out.println(field.name() + "\t" + field.stringValue()); } reader.close(); }
From source file:gov.ssa.test.lucenedemo.IndexFiles.java
/** * Indexes a single document/*from w w w . ja v a 2 s .c o m*/ */ static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException { try (InputStream stream = Files.newInputStream(file)) { // make a new, empty document Document doc = new Document(); // Add the path of the file as a field named "path". Use a // field that is indexed (i.e. searchable), but don't tokenize // the field into separate words and don't index term frequency // or positional information: Field pathField = new StringField("path", file.toString(), Field.Store.YES); doc.add(pathField); // Add the last modified date of the file a field named "modified". // Use a LongPoint that is indexed (i.e. efficiently filterable with // PointRangeQuery). This indexes to milli-second resolution, which // is often too fine. You could instead create a number based on // year/month/day/hour/minutes/seconds, down the resolution you require. // For example the long value 2011021714 would mean // February 17, 2011, 2-3 PM. doc.add(new LongPoint("modified", lastModified)); // Add the contents of the file to a field named "contents". Specify a Reader, // so that the text of the file is tokenized and indexed, but not stored. // Note that FileReader expects the file to be in UTF-8 encoding. // If that's not the case searching for special characters will fail. doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)))); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): System.out.println("adding " + file); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: System.out.println("updating " + file); writer.updateDocument(new Term("path", file.toString()), doc); } } }
From source file:index.IndexEx.java
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException { try (InputStream stream = Files.newInputStream(file)) { Document doc = new Document(); Field pathField = new StringField("path", file.toString(), Field.Store.YES); doc.add(pathField);//from w ww. j a v a 2 s .c om doc.add(new LongField("modified", lastModified, Field.Store.NO)); doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)))); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { System.out.println("adding " + file); writer.addDocument(doc); } else { System.out.println("updating " + file); writer.updateDocument(new Term("path", file.toString()), doc); } } }