List of usage examples for org.apache.lucene.index IndexWriter updateDocument
public long updateDocument(Term term, Iterable<? extends IndexableField> doc) throws IOException
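updateDocument atomically deletes every document matching the given term and then adds the new document, so it is the standard way to replace a document keyed by a unique field. Before the per-project examples, a minimal sketch of the call pattern (the path, analyzer, and field names here are illustrative assumptions, not taken from the examples below):

    Directory dir = FSDirectory.open(Paths.get("/tmp/example-index"));
    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()));

    Document doc = new Document();
    // The key field must be indexed (StringField), not merely stored,
    // for the Term below to match it on later updates.
    doc.add(new StringField("id", "42", Field.Store.YES));
    doc.add(new TextField("body", "updated contents", Field.Store.NO));

    // Replaces any existing document whose "id" is exactly "42";
    // if none exists, this behaves like addDocument.
    writer.updateDocument(new Term("id", "42"), doc);
    writer.close();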
From source file: index.IndexOmimtsv.java
License: Apache License
/** Indexes a single document */
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    try (InputStream stream = Files.newInputStream(file)) {
        // make a new, empty document
        Document doc = new Document();

        // Add the path of the file as a field named "path". Use a field that is
        // indexed (i.e. searchable), but don't tokenize the field into separate
        // words and don't index term frequency or positional information:
        Field pathField = new StringField("path", file.toString(), Field.Store.YES);
        doc.add(pathField);

        // Add the last modified date of the file as a field named "modified".
        // Use a LongPoint that is indexed (i.e. efficiently filterable with
        // PointRangeQuery). This indexes to millisecond resolution, which is
        // often too fine. You could instead create a number based on
        // year/month/day/hour/minutes/seconds, down to the resolution you require.
        // For example the long value 2011021714 would mean February 17, 2011, 2-3 PM.
        doc.add(new LongPoint("modified", lastModified));

        // Read the TSV file line by line (skipping the first line);
        // each qualifying line becomes its own document.
        InputStreamReader ipsr = new InputStreamReader(stream);
        BufferedReader br = new BufferedReader(ipsr);
        String line = br.readLine();
        int cpt = 0;
        while ((line = br.readLine()) != null) {
            String[] tokens = line.trim().split("\t");
            if (tokens.length > 6) {
                String id = tokens[0].split("/")[tokens[0].split("/").length - 1].trim();
                if (id.matches("^[0-9]*")) {
                    doc = new Document();
                    cpt++;
                    doc.add(new TextField("ID", id, Field.Store.NO));
                    if (!tokens[5].trim().matches("^C[0-9].*")) {
                        for (String token : tokens) {
                            if (token.trim().matches("^C[0-9].*")) {
                                doc.add(new StoredField("CUI", token.trim()));
                                break;
                            }
                        }
                        if (doc.getFields().size() != 2)
                            doc.add(new StoredField("CUI", ""));
                    } else
                        doc.add(new StoredField("CUI", tokens[5].trim()));
                    doc.add(new StoredField("Label", tokens[1].trim()));
                    writer.addDocument(doc);
                }
            }
        }
        System.out.println("Number of elements: " + cpt);
        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // New index, so we just add the document (no old document can be there):
            System.out.println("adding " + file);
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been indexed) so
            // we use updateDocument instead to replace the old one matching the exact
            // path, if present:
            System.out.println("updating " + file);
            writer.updateDocument(new Term("path", file.toString()), doc);
        }
    }
}
From source file: indexer.LuceneIndexerAddDocument.java
/**
 * Indexes a single document with the aid of Apache Tika.
 *
 * @param writer Writer to the index where the given file/dir info will be stored.
 * @param file   The file to index, or the directory to recurse into to find files to index.
 * @param attrs  The attributes of the given file, gathered while walking the file tree.
 * @param global Reference to the global class variables and methods.
 * @throws IOException
 */
static void indexDoc(IndexWriter writer, Path file, BasicFileAttributes attrs, Global global)
        throws IOException {
    File document = file.toFile();
    // renameTo(itself) is a crude check that the file is not locked by another process
    if (document.renameTo(document)) {
        try (InputStream stream = Files.newInputStream(file)) {
            // make a new, empty document
            Document doc = new Document();

            // Add the path of the file as a field named "path".
            Field pathField = new StringField("path", file.toString(), Field.Store.YES);
            doc.add(pathField);

            // Add the last modified date of the file as a field named "modified".
            doc.add(new LongField("modified", attrs.lastModifiedTime().toMillis(), Field.Store.YES));

            // Add the created date of the file as a field named "created".
            doc.add(new LongField("created", attrs.creationTime().toMillis(), Field.Store.YES));

            // Add the document file name.
            doc.add(new StringField("filename", file.getFileName().toString(), Field.Store.YES));

            // Add the contents of the file as a field named "vcontent".
            // Parser plumbing for Tika:
            BodyContentHandler handler = new BodyContentHandler(global.WRITE_LIMIT);
            Metadata metadata = new Metadata();
            FileInputStream inputstream = new FileInputStream(new File(file.toString()));
            ParseContext pcontext = new ParseContext();

            // New field type
            FieldType bodyType = new FieldType();
            bodyType.setStored(true);
            bodyType.setTokenized(true);
            // for Highlighter, FastVectorHighlighter
            bodyType.setStoreTermVectors(true);
            bodyType.setStoreTermVectorPositions(true);
            bodyType.setStoreTermVectorOffsets(true);
            // for PostingsHighlighter
            bodyType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);

            /*
             * Determine the document type and the proper parser for the document.
             * Once the type is determined, grab the content and position offsets
             * for highlighting.
             */
            try {
                if (file.toString().endsWith(".pdf")) {
                    PDFParser pdfparser = new PDFParser();
                    pdfparser.parse(inputstream, handler, metadata, pcontext);
                    doc.add(new Field("vcontent", handler.toString(), bodyType));
                } else if (file.toString().endsWith(".docx") || file.toString().endsWith(".pptx")
                        || file.toString().endsWith(".xlsx") || file.toString().endsWith(".docm")
                        || file.toString().endsWith(".pptm") || file.toString().endsWith(".xlsm")) {
                    OOXMLParser msofficeparser = new OOXMLParser();
                    msofficeparser.parse(inputstream, handler, metadata, pcontext);
                    doc.add(new Field("vcontent", handler.toString(), bodyType));
                } else if (file.toString().endsWith(".doc") || file.toString().endsWith(".ppt")
                        || file.toString().endsWith(".xls")) {
                    OfficeParser msofficeparser = new OfficeParser();
                    msofficeparser.parse(inputstream, handler, metadata, pcontext);
                    doc.add(new Field("vcontent", handler.toString(), bodyType));
                } else if (file.toString().endsWith(".odt") || file.toString().endsWith(".odp")
                        || file.toString().endsWith(".ods")) {
                    OpenDocumentParser openofficeparser = new OpenDocumentParser();
                    openofficeparser.parse(inputstream, handler, metadata, pcontext);
                    doc.add(new Field("vcontent", handler.toString(), bodyType));
                } else if (file.toString().endsWith(".epub")) {
                    EpubParser epubParser = new EpubParser();
                    epubParser.parse(inputstream, handler, metadata, pcontext);
                    doc.add(new Field("vcontent", handler.toString(), bodyType));
                } else if (file.toString().endsWith(".xml")) {
                    XMLParser XMLparser = new XMLParser();
                    XMLparser.parse(inputstream, handler, metadata, pcontext);
                    doc.add(new Field("vcontent", handler.toString(), bodyType));
                } else if (file.toString().endsWith(".htm") || file.toString().endsWith(".html")
                        || file.toString().endsWith(".mhtml")) {
                    HtmlParser HTMLparser = new HtmlParser();
                    HTMLparser.parse(inputstream, handler, metadata, pcontext);
                    doc.add(new Field("vcontent", handler.toString(), bodyType));
                } else if (file.toString().endsWith(".rtf")) {
                    RTFParser RTFparser = new RTFParser();
                    RTFparser.parse(inputstream, handler, metadata, pcontext);
                    doc.add(new Field("vcontent", handler.toString(), bodyType));
                } else if (file.toString().endsWith(".txt")) {
                    TXTParser TXTparser = new TXTParser();
                    TXTparser.parse(inputstream, handler, metadata, pcontext);
                    doc.add(new Field("vcontent", handler.toString(), bodyType));
                } else {
                    // Fall back to indexing the raw stream as UTF-8 text.
                    BufferedReader buffedRead = new BufferedReader(
                            new InputStreamReader(stream, StandardCharsets.UTF_8));
                    doc.add(new TextField("vcontent", buffedRead));
                }
            } catch (SAXException | TikaException ex) {
                log.fatal("Document Parsing Exception");
            }

            if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) {
                // New index, so we just add the document (no old document can be there):
                writer.addDocument(doc);
                System.out.println("adding " + file);
            } else {
                // Existing index (an old copy of this document may have been indexed) so
                // we use updateDocument instead to replace the old one matching the exact
                // path, if present:
                writer.updateDocument(new Term("path", file.toString()), doc);
                System.out.println("updating " + file);
            }
        }
    } else {
        System.out.println("LOCKED: " + file);
    }
}
From source file: InformationRetrieval.IndexFiles.java
License: Apache License
/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 *
 * NOTE: This method indexes one document per input file. This is slow. For good
 * throughput, put multiple documents into your input file(s). An example of this is
 * in the benchmark module, which can create "line doc" files, one document per line,
 * using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file The file to index, or the directory to recurse into to find files to index
 * @throws IOException If there is a low-level I/O error
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this exception with an
                // "access denied" message; checking if the file can be read doesn't help
                return;
            }
            try {
                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path". Use a field that is
                // indexed (i.e. searchable), but don't tokenize the field into separate
                // words and don't index term frequency or positional information:
                Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);

                BufferedReader br = new BufferedReader(new FileReader(file));
                String firstLine = br.readLine(); // skip the first line
                String titleLine = br.readLine(); // the second line holds the title
                Field title = new TextField("Title", titleLine, Field.Store.YES);
                title.setBoost(90F);
                doc.add(title);

                // Read up to three more lines to build a short snippet.
                String s1 = br.readLine();
                String s2 = br.readLine();
                String s3 = br.readLine();
                String snip = (s1 == null ? "" : s1) + " " + (s2 == null ? "" : s2) + " "
                        + (s3 == null ? "" : s3);
                Field snippet = new StringField("Snippet", snip, Field.Store.YES);
                doc.add(snippet);

                // Add the last modified date of the file as a field named "modified".
                // Use a LongField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter). This indexes to millisecond resolution, which
                // is often too fine. You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down to the resolution you require.
                // For example the long value 2011021714 would mean February 17, 2011, 2-3 PM.
                doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

                // Add the contents of the file to a field named "contents". Specify a Reader,
                // so that the text of the file is tokenized and indexed, but not stored.
                // Note that FileReader expects the file to be in UTF-8 encoding.
                // If that's not the case searching for special characters will fail.
                doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been indexed) so
                    // we use updateDocument instead to replace the old one matching the exact
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }
            } finally {
                fis.close();
            }
        }
    }
}
From source file: io.anserini.index.IndexTweetsUpdatePlace.java
License: Apache License
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(new Option(HELP_OPTION, "show help")); options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment")); options.addOption(new Option(STORE_TERM_VECTORS_OPTION, "store term vectors")); options.addOption(OptionBuilder.withArgName("collection").hasArg() .withDescription("source collection directory").create(COLLECTION_OPTION)); options.addOption(/*from w w w . ja v a 2 s . c o m*/ OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("file").hasArg().withDescription("file with deleted tweetids") .create(DELETES_OPTION)); options.addOption(OptionBuilder.withArgName("id").hasArg().withDescription("max id").create(MAX_ID_OPTION)); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(IndexTweetsUpdatePlace.class.getName(), options); System.exit(-1); } String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION); String indexPath = cmdline.getOptionValue(INDEX_OPTION); System.out.println(collectionPath + " " + indexPath); LOG.info("collection: " + collectionPath); LOG.info("index: " + indexPath); long startTime = System.currentTimeMillis(); File file = new File(collectionPath); if (!file.exists()) { System.err.println("Error: " + file + " does not exist!"); System.exit(-1); } final FieldType textOptions = new FieldType(); textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); textOptions.setStored(true); textOptions.setTokenized(true); if (cmdline.hasOption(STORE_TERM_VECTORS_OPTION)) { textOptions.setStoreTermVectors(true); } final StatusStream stream = new JsonStatusCorpusReader(file); final Directory dir = new SimpleFSDirectory(Paths.get(cmdline.getOptionValue(INDEX_OPTION))); final IndexWriterConfig config = new IndexWriterConfig(ANALYZER); config.setOpenMode(IndexWriterConfig.OpenMode.APPEND); final IndexWriter writer = new IndexWriter(dir, config); System.out.print("Original # of docs " + writer.numDocs()); int updateCount = 0; Runtime.getRuntime().addShutdownHook(new Thread() { public void run() { try { stream.close(); } catch (IOException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } try { writer.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } try { dir.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } System.out.println("Shutting down"); } }); int cnt = 0; Status status; try { while ((status = stream.next()) != null) { if (status.getPlace() != null) { // Query q = NumericRangeQuery.newLongRange(TweetStreamReader.StatusField.ID.name, status.getId(), // status.getId(), true, true); // System.out.print("Deleting docCount="+writer.numDocs()); // writer.deleteDocuments(q); // writer.commit(); // System.out.print(" Deleted docCount="+writer.numDocs()); Document doc = new Document(); doc.add(new LongField(StatusField.ID.name, status.getId(), Field.Store.YES)); doc.add(new LongField(StatusField.EPOCH.name, status.getEpoch(), Field.Store.YES)); doc.add(new 
TextField(StatusField.SCREEN_NAME.name, status.getScreenname(), Store.YES)); doc.add(new Field(StatusField.TEXT.name, status.getText(), textOptions)); doc.add(new IntField(StatusField.FRIENDS_COUNT.name, status.getFollowersCount(), Store.YES)); doc.add(new IntField(StatusField.FOLLOWERS_COUNT.name, status.getFriendsCount(), Store.YES)); doc.add(new IntField(StatusField.STATUSES_COUNT.name, status.getStatusesCount(), Store.YES)); doc.add(new DoubleField(StatusField.LONGITUDE.name, status.getLongitude(), Store.YES)); doc.add(new DoubleField(StatusField.LATITUDE.name, status.getlatitude(), Store.YES)); doc.add(new StringField(StatusField.PLACE.name, status.getPlace(), Store.YES)); long inReplyToStatusId = status.getInReplyToStatusId(); if (inReplyToStatusId > 0) { doc.add(new LongField(StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId, Field.Store.YES)); doc.add(new LongField(StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId(), Field.Store.YES)); } String lang = status.getLang(); if (!lang.equals("unknown")) { doc.add(new TextField(StatusField.LANG.name, status.getLang(), Store.YES)); } long retweetStatusId = status.getRetweetedStatusId(); if (retweetStatusId > 0) { doc.add(new LongField(StatusField.RETWEETED_STATUS_ID.name, retweetStatusId, Field.Store.YES)); doc.add(new LongField(StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId(), Field.Store.YES)); doc.add(new IntField(StatusField.RETWEET_COUNT.name, status.getRetweetCount(), Store.YES)); if (status.getRetweetCount() < 0 || status.getRetweetedStatusId() < 0) { LOG.warn("Error parsing retweet fields of " + status.getId()); } } long id = status.getId(); BytesRefBuilder brb = new BytesRefBuilder(); NumericUtils.longToPrefixCodedBytes(id, 0, brb); Term term = new Term(StatusField.ID.name, brb.get()); writer.updateDocument(term, doc); // writer.addDocument(doc); updateCount += 1; if (updateCount % 10000 == 0) { LOG.info(updateCount + " statuses updated"); writer.commit(); System.out.println("Updated docCount=" + writer.numDocs()); } } } LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms"); } catch (Exception e) { e.printStackTrace(); } finally { writer.close(); dir.close(); stream.close(); } }
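The update key above is a numeric tweet id, which cannot go into a Term as a plain string: in the Lucene 4.x/5.x API used here, LongField values are indexed in prefix-coded form, so the Term bytes must be encoded the same way. A condensed sketch of just that step (the field name and id value are illustrative):

    // Encode the long exactly as LongField indexed it (shift 0 = full precision),
    // so the Term matches and updateDocument replaces the old tweet document.
    long id = 123456789L;
    BytesRefBuilder bytes = new BytesRefBuilder();
    NumericUtils.longToPrefixCodedBytes(id, 0, bytes);
    writer.updateDocument(new Term("id", bytes.get()), doc);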
From source file: io.datalayer.lucene.index.IndexerMain.java
License: Apache License
/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 *
 * NOTE: This method indexes one document per input file. This is slow. For good
 * throughput, put multiple documents into your input file(s). An example of this
 * is in the benchmark module, which can create "line doc" files, one document per
 * line, using the <a href=
 * "../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file The file to index, or the directory to recurse into to find files to index
 * @throws IOException
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this exception with an
                // "access denied" message; checking if the file can be read doesn't help
                return;
            }
            try {
                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path". Use a field that is
                // indexed (i.e. searchable), but don't tokenize the field into separate
                // words and don't index term frequency or positional information:
                Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);

                // Add the last modified date of the file as a field named "modified".
                // Use a LongField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter). This indexes to millisecond resolution, which is
                // often too fine. You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down to the resolution you require.
                // For example the long value 2011021714 would mean February 17, 2011, 2-3 PM.
                doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

                // Add the contents of the file to a field named "contents". Specify a Reader,
                // so that the text of the file is tokenized and indexed, but not stored.
                // Note that FileReader expects the file to be in UTF-8 encoding.
                // If that's not the case searching for special characters will fail.
                /*
                 * doc.add(new TextField("contents", new BufferedReader(new
                 * InputStreamReader(fis, "UTF-8")), Field.Store.NO));
                 */

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    LOGGER.info("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been indexed) so
                    // we use updateDocument instead to replace the old one matching the
                    // exact path, if present:
                    LOGGER.info("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }
            } finally {
                fis.close();
            }
        }
    }
}
From source file: io.datalayer.lucene.index.LuceneLifecycleTest.java
License: Apache License
/**
 * #A 2 docs in the index
 * #B Delete first document
 * #C 1 indexed document, 0 deleted documents
 */
@Test
public void testUpdate() throws IOException {
    assertEquals(1, getHitCount("city", "Amsterdam"));
    IndexWriter writer = getWriter();
    Document doc = new Document();
    doc.add(new StoredField("id", "1"));
    doc.add(new StoredField("country", "Netherlands"));
    doc.add(new Field("contents", "Den Haag has a lot of museums",
            AosFieldType.INDEXED_STOREDNOT_TERMVECTORS));
    doc.add(new Field("city", "Den Haag", AosFieldType.INDEXED_STOREDNOT_TERMVECTORS));
    writer.updateDocument(new Term("id", "1"), doc);
    writer.close();
    assertEquals(0, getHitCount("city", "Amsterdam"));
    assertEquals(1, getHitCount("city", "Haag"));
}
From source file: io.jpress.searcher.LuceneSearcher.java
License: LGPL
@Override
public void updateBean(SearcherBean bean) {
    try {
        IndexWriter indexWriter = createIndexWriter();
        Term term = new Term("sid", bean.getSid());
        indexWriter.updateDocument(term, createDocument(bean));
        indexWriter.commit();
        indexWriter.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file: io.seldon.semvec.CreateLuceneIndexFromDb.java
License: Apache License
private void saveDocument(long id, IndexSearcher reader, IndexWriter writer, BufferedWriter fileWriter)
        throws CorruptIndexException, IOException {
    String path;
    if (rawIds)
        path = "" + id;
    else
        path = sequentialIds ? toSV(this.seqId++) : toSV(id);
    if (reader != null && appendOnly) {
        Term docPathTerm = new Term(FIELD_PATH, path);
        TermQuery tq = new TermQuery(docPathTerm);
        int hits = reader.search(tq, 1).totalHits;
        if (hits > 0) {
            if (debug)
                System.out.println("Skipping existing doc with id " + id);
            return; // document exists so don't do anything
        }
    }
    String comments = null;
    String nlpComments = null;
    switch (this.extractionMethod) {
    case COMMENTS:
        comments = docStore.getComments(id);
        break;
    case ITEM_ATTR:
        if (attrIds != null && attrIds.length > 0)
            comments = docStore.getItemTextualById(id, new HashSet<Integer>(Arrays.asList(attrIds)));
        else if (attrNames != null && attrNames.length > 0)
            comments = docStore.getItemTextualByName(id, new HashSet<String>(Arrays.asList(attrNames)));
        else
            comments = docStore.getItemTextual(id);
        if (nlpAttrIds != null && nlpAttrIds.length > 0)
            nlpComments = docStore.getItemTextualById(id, new HashSet<Integer>(Arrays.asList(nlpAttrIds)));
        break;
    case USERS:
        comments = docStore.getUserItems(id, useItemIds);
        break;
    case USER_ACTIONS:
        comments = docStore.getUserActionAttrs(id, new HashSet<Integer>(Arrays.asList(attrIds)));
        break;
    case USER_DIM:
        comments = docStore.getDimTextual(id, new HashSet<Integer>(Arrays.asList(textAttrIds)), itemLimit);
        break;
    }
    if (comments != null) {
        if (this.removeHtml) {
            System.out.println("removing html");
            Source source = new Source(comments);
            comments = source.getTextExtractor().toString();
            if (nlpComments != null) {
                source = new Source(nlpComments);
                nlpComments = source.getTextExtractor().toString();
            }
        }
        comments = comments.replaceAll("\\,|\\.|\\!|\\;|\\/", " ");
        if (transLiterate) {
            System.out.println("removing punctuation");
            comments = TransliteratorPeer.getPunctuationTransLiterator().transliterate(comments);
        }
        if (addEntities != null && nlpComments == null)
            comments = addEntities.process(comments);
        else if (addEntities != null && nlpComments != null) {
            nlpComments = addEntities.process(nlpComments);
            // System.out.println("NLP Comments:[" + nlpComments + "]");
            // System.out.println("Existing comments:" + comments);
            comments = comments + " " + nlpComments;
        } else {
            comments = comments.replaceAll("\\,|\\.|\\!|\\;|\\/", " ");
            if (transLiterate) {
                System.out.println("removing punctuation");
                comments = TransliteratorPeer.getPunctuationTransLiterator().transliterate(comments);
            }
        }
        comments = comments.replaceAll("\\|", "");
        comments = comments.trim();
        String[] tokens = comments.split(" ");
        if (!"".equals(comments) && tokens.length >= minTokens) {
            if (debug)
                System.out.println("adding document for id " + id + " with text:[" + comments + "]");
            if (reader != null) {
                Term docPathTerm = new Term(FIELD_PATH, path);
                TermQuery tq = new TermQuery(docPathTerm);
                int hits = reader.search(tq, 1).totalHits;
                if (hits > 0) // doc exists in index (assumes a unique match...)
                    writer.updateDocument(docPathTerm, createDoc(path, comments));
                else
                    writer.addDocument(createDoc(path, comments));
            } else
                writer.addDocument(createDoc(path, comments));
            if (fileWriter != null) {
                if (yahooLDAfile != null) {
                    fileWriter.write("" + id);
                    fileWriter.write(" ");
                    fileWriter.write(path);
                    fileWriter.write(" ");
                    fileWriter.write(comments);
                    fileWriter.write("\n");
                } else {
                    fileWriter.write("" + id);
                    fileWriter.write(",");
                    fileWriter.write(comments);
                    fileWriter.write("\n");
                }
            }
        } else
            System.out.println("Skipping document with id " + id + " of token length " + tokens.length);
    }
}
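A note on the hits check above: updateDocument already behaves like addDocument when no document matches the term, so the update-or-add branch could be collapsed; the searcher lookup is only strictly needed for the appendOnly skip. A minimal equivalent of the branch, assuming the same createDoc helper and FIELD_PATH constant from this class:

    // Safe whether or not the document exists: matching docs are deleted
    // atomically before the new document is added.
    writer.updateDocument(new Term(FIELD_PATH, path), createDoc(path, comments));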
From source file: l3.IndexFiles.java
License: Apache License
/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 *
 * NOTE: This method indexes one document per input file. This is slow. For good
 * throughput, put multiple documents into your input file(s). An example of this is
 * in the benchmark module, which can create "line doc" files, one document per line,
 * using the <a
 * href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file The file to index, or the directory to recurse into to find files to index
 * @throws IOException If there is a low-level I/O error
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this exception with an
                // "access denied" message; checking if the file can be read doesn't help
                return;
            }
            try {
                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path". Use a field that is
                // indexed (i.e. searchable), but don't tokenize the field into separate
                // words and don't index term frequency or positional information:
                Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);

                // Add the last modified date of the file as a field named "modified".
                // Use a LongField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter). This indexes to millisecond resolution, which is
                // often too fine. You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down to the resolution you require.
                // For example the long value 2011021714 would mean February 17, 2011, 2-3 PM.
                doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

                // Add the contents of the file to a field named "contents". Specify a Reader,
                // so that the text of the file is tokenized and indexed, but not stored.
                // Note that FileReader expects the file to be in UTF-8 encoding.
                // If that's not the case searching for special characters will fail.
                doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been indexed) so
                    // we use updateDocument instead to replace the old one matching the exact
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }
            } finally {
                fis.close();
            }
        }
    }
}
From source file: lia.chapter2.IndexingTest.java
License: Apache License
@Test
public void testUpdate() throws IOException {
    assertEquals(1, getHitCount("city", "Amsterdam"));
    IndexWriter writer = getWriter();
    Document doc = new Document();                                                          // A
    doc.add(new Field("id", "1", StringField.TYPE_STORED));                                 // A
    doc.add(new Field("country", "Netherlands", StringField.TYPE_STORED));                  // A
    doc.add(new Field("contents", "Den Haag has a lot of museums", TextField.TYPE_STORED)); // A
    doc.add(new Field("city", "Den Haag", TextField.TYPE_STORED));                          // A
    writer.updateDocument(new Term("id", "1"), doc);                                        // B
    writer.close();
    assertEquals(0, getHitCount("city", "Amsterdam"));                                      // C
    assertEquals(1, getHitCount("city", "Haag"));                                           // D
}