List of usage examples for org.apache.lucene.index.IndexWriter.addDocument
public long addDocument(Iterable<? extends IndexableField> doc) throws IOException
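Before the source-file examples, here is a minimal, self-contained sketch of the call. It assumes a recent Lucene release (7.x or later, where addDocument returns a sequence number); the index path and the "contents" field name are illustrative, not taken from any example below.

import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class AddDocumentSketch {
    public static void main(String[] args) throws IOException {
        // Open (or create) an on-disk index; the path is illustrative.
        try (Directory dir = FSDirectory.open(Paths.get("/tmp/example-index"));
                IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            Document doc = new Document();
            doc.add(new TextField("contents", "hello lucene", Field.Store.YES));
            // addDocument buffers the document in RAM; it becomes searchable after
            // commit() or through a near-real-time reader. In Lucene 7+ the return
            // value is a sequence number identifying this operation.
            long seqNo = writer.addDocument(doc);
            writer.commit();
        }
    }
}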
From source file: br.andrew.lucene.testing.IndexFiles.java
License: Apache License

/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 *
 * NOTE: This method indexes one document per input file. This is slow. For good
 * throughput, put multiple documents into your input file(s). An example of this is
 * in the benchmark module, which can create "line doc" files, one document per line,
 * using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file The file to index, or the directory to recurse into to find files to index
 * @throws IOException If there is a low-level I/O error
 */
static void indexDocs(final IndexWriter writer, final File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            final String[] files = file.list(); // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    IndexFiles.indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (final FileNotFoundException fnfe) {
                // At least on Windows, some temporary files raise this exception with an
                // "access denied" message; checking whether the file can be read doesn't help.
                return;
            }
            try {
                // make a new, empty document
                final Document doc = new Document();

                // Add the path of the file as a field named "path". Use a
                // field that is indexed (i.e. searchable), but don't tokenize
                // the field into separate words and don't index term frequency
                // or positional information:
                final Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);

                // Add the last modified date of the file as a field named "modified".
                // Use a LongField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter). This indexes to millisecond resolution, which
                // is often too fine. You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down to the resolution you require.
                // For example the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.
                doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

                // Add the contents of the file to a field named "contents". Specify a Reader,
                // so that the text of the file is tokenized and indexed, but not stored.
                // Note that the reader decodes the file as UTF-8; if that is not the file's
                // actual encoding, searching for special characters will fail.
                doc.add(new TextField("contents",
                        new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been indexed), so
                    // we use updateDocument instead to replace the old one matching the exact
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }
            } finally {
                fis.close();
            }
        }
    }
}
From source file: br.bireme.ngrams.NGrams.java

public static boolean indexDocument(final NGIndex index, final IndexWriter writer, final NGSchema schema,
        final String pipedDoc, final boolean allowDocUpdate) throws IOException, ParseException {
    if (index == null) {
        throw new NullPointerException("index");
    }
    if (writer == null) {
        throw new NullPointerException("writer");
    }
    if (schema == null) {
        throw new NullPointerException("schema");
    }
    if (pipedDoc == null) {
        throw new NullPointerException("pipedDoc");
    }
    boolean ret = false;
    final String pipedDocT = pipedDoc.trim();
    if (!isUtf8Encoding(pipedDocT)) {
        throw new IOException("Invalid encoded string");
    }
    if (!pipedDocT.isEmpty()) {
        final Parameters parameters = schema.getParameters();
        if (Tools.countOccurrences(pipedDoc, '|') < parameters.maxIdxFieldPos) {
            throw new IOException("invalid number of fields: [" + pipedDoc + "]");
        }
        final String pipedDoc2 = StringEscapeUtils.unescapeHtml4(pipedDoc);
        final String[] split = pipedDoc2.replace(':', ' ').trim().split(" *\\| *", Integer.MAX_VALUE);
        final String id = split[parameters.id.pos];
        if (id.isEmpty()) {
            throw new IOException("id");
        }
        final String dbName = split[parameters.db.pos];
        if (dbName.isEmpty()) {
            throw new IOException("dbName");
        }
        final Map<String, br.bireme.ngrams.Field> flds = parameters.nameFields;
        final Document doc = createDocument(flds, split);
        if (doc != null) {
            if (allowDocUpdate) {
                writer.updateDocument(new Term("id", id), doc);
                writer.commit();
            } else {
                writer.addDocument(doc);
            }
            ret = true;
        }
    }
    return ret;
}
From source file: br.com.crawlerspring.model.Searcher.java

private void addDoc(IndexWriter writer, String title, String content) throws IOException {
    org.apache.lucene.document.Document luceneDocument = new org.apache.lucene.document.Document();
    luceneDocument.add(new TextField("title", title, Field.Store.YES));
    luceneDocument.add(new TextField("content", content, Field.Store.YES));
    writer.addDocument(luceneDocument);
}
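A brief usage sketch for this helper (the directory path, analyzer, and sample strings are illustrative; Searcher's actual writer setup is not shown in the source):

Directory directory = FSDirectory.open(Paths.get("/tmp/crawler-index"));
IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new StandardAnalyzer()));
addDoc(writer, "Lucene in Action", "Lucene is a full-text search library for Java.");
writer.close();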
From source file: buscador.IndexFiles.java
License: Apache License

/**
 * Indexes the given file using the given writer, or if a directory is
 * given, recurses over files and directories found under the given
 * directory.
 *
 * NOTE: This method indexes one document per input file. This is slow. For
 * good throughput, put multiple documents into your input file(s). An
 * example of this is in the benchmark module, which can create "line doc"
 * files, one document per line, using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file The file to index, or the directory to recurse into to find files to index
 * @throws IOException If there is a low-level I/O error
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list(); // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // At least on Windows, some temporary files raise this exception with an
                // "access denied" message; checking whether the file can be read doesn't help.
                return;
            }
            try {
                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path". Use a
                // field that is indexed (i.e. searchable), but don't tokenize
                // the field into separate words and don't index term frequency
                // or positional information:
                Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);

                // Add the last modified date of the file as a field named "modified".
                // Use a LongField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter). This indexes to millisecond resolution, which
                // is often too fine. You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down to the resolution you require.
                // For example the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.
                doc.add(new LongField("modified", file.lastModified(), Field.Store.YES));

                insertarenIndice(file, "dc:creator", "creator", doc, "text");
                insertarenIndice(file, "dc:title", "title", doc, "text");
                insertarenIndice(file, "dc:description", "description", doc, "text");
                insertarenIndice(file, "dc:identifier", "identifier", doc, "text");
                insertarenIndice(file, "dc:date", "date", doc, "text");

                // Add the contents of the file to a field named "contents". Specify a Reader,
                // so that the text of the file is tokenized and indexed, but not stored.
                // Note that FileReader expects the file to be in UTF-8 encoding.
                // If that's not the case, searching for special characters will fail.
                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been indexed), so
                    // we use updateDocument instead to replace the old one matching the exact
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }
            } finally {
                fis.close();
            }
        }
    }
}
From source file: byrne.mitre.main.NameMatcher.java
License: Apache License

private static void loadIndex(String filename, IndexWriter writer) throws IOException {
    BufferedReader bufferedReader = new BufferedReader(new FileReader(filename));
    String line = null;
    while ((line = bufferedReader.readLine()) != null) {
        NameEntry entry = new NameEntry(line);
        Document doc = new Document();
        doc.add(new Field("id", entry.getID(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("name", entry.getFullName(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("ngrams", new StringReader(entry.getFullName()), Field.TermVector.YES));
        writer.addDocument(doc);
    }
    bufferedReader.close();
}
From source file: bzh.terrevirtuelle.navisu.gazetteer.impl.lucene.GeoNameResolver.java
License: Apache License

/**
 * Indexes one line of gazetteer data using built-in Lucene index functions.
 *
 * @param indexWriter Lucene indexWriter to be loaded
 * @param line a line from the gazetteer file
 * @throws IOException
 * @throws NumberFormatException
 */
private void addDoc(IndexWriter indexWriter, final String line, final boolean reverseGeocodingEnabled) {
    String[] tokens = line.split("\t");
    int ID = Integer.parseInt(tokens[0]);
    String name = tokens[1];
    String alternatenames = tokens[3];

    Double latitude = -999999.0;
    try {
        latitude = Double.parseDouble(tokens[4]);
    } catch (NumberFormatException e) {
        latitude = OUT_OF_BOUNDS;
    }
    Double longitude = -999999.0;
    try {
        longitude = Double.parseDouble(tokens[5]);
    } catch (NumberFormatException e) {
        longitude = OUT_OF_BOUNDS;
    }

    int population = 0;
    try {
        population = Integer.parseInt(tokens[14]);
    } catch (NumberFormatException e) {
        population = 0; // treat as if the population does not exist
    }

    // Additional fields to rank better-known locations higher.
    // All available codes can be viewed on www.geonames.org
    String featureCode = tokens[7];  // more granular category
    String countryCode = tokens[8];
    String admin1Code = tokens[10];  // e.g. US state
    String admin2Code = tokens[11];  // e.g. county

    Document doc = new Document();
    doc.add(new IntField(FIELD_NAME_ID, ID, Field.Store.YES));
    doc.add(new TextField(FIELD_NAME_NAME, name, Field.Store.YES));
    doc.add(new DoubleField(FIELD_NAME_LONGITUDE, longitude, Field.Store.YES));
    doc.add(new DoubleField(FIELD_NAME_LATITUDE, latitude, Field.Store.YES));
    doc.add(new TextField(FIELD_NAME_ALTERNATE_NAMES, alternatenames, Field.Store.YES));
    doc.add(new TextField(FIELD_NAME_FEATURE_CODE, featureCode, Field.Store.YES));
    doc.add(new TextField(FIELD_NAME_COUNTRY_CODE, countryCode, Field.Store.YES));
    doc.add(new TextField(FIELD_NAME_ADMIN1_CODE, admin1Code, Field.Store.YES));
    doc.add(new TextField(FIELD_NAME_ADMIN2_CODE, admin2Code, Field.Store.YES));
    doc.add(new NumericDocValuesField(FIELD_NAME_POPULATION, population)); // sort-enabled field

    if (reverseGeocodingEnabled) {
        Point point = ctx.makePoint(longitude, latitude);
        for (IndexableField f : strategy.createIndexableFields(point)) {
            doc.add(f);
        }
    }

    try {
        indexWriter.addDocument(doc);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
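The ctx and strategy members used for reverse geocoding are not shown in this snippet. A plausible initialization, offered only as a sketch of the common spatial4j + lucene-spatial pattern (the "geoField" name and tree depth are assumptions; in older releases SpatialContext lives in com.spatial4j.core.context rather than org.locationtech.spatial4j.context):

import org.apache.lucene.spatial.SpatialStrategy;
import org.apache.lucene.spatial.prefix.RecursivePrefixTreeStrategy;
import org.apache.lucene.spatial.prefix.tree.GeohashPrefixTree;
import org.locationtech.spatial4j.context.SpatialContext;

// Hypothetical field declarations, not from the source:
private final SpatialContext ctx = SpatialContext.GEO;
private final SpatialStrategy strategy =
        new RecursivePrefixTreeStrategy(new GeohashPrefixTree(SpatialContext.GEO, 11), "geoField");
        // geohash length 11 gives roughly meter-scale grid cells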
From source file: ca.dracode.ais.indexer.FileIndexer.java
License: Open Source License

/**
 * Creates a Document containing contents and metadata for a specific page of a file
 *
 * @param writer The writer used to save the metadata
 * @param file The file that the page belongs to
 * @param page The index of the page in the file
 * @param contents The string contents of the file
 */
public static void Build(IndexWriter writer, File file, int page, String contents) {
    if (file.canRead()) {
        try {
            //Log.i(TAG, "Started Indexing file: " + file.getName() + " " + page);
            Document doc = new Document();
            doc.add(new StringField("id", file.getPath() + ":" + page, Field.Store.NO));
            doc.add(new StringField("path", file.getPath(), Field.Store.YES));
            doc.add(new LongField("modified", file.lastModified(), Field.Store.YES));
            // for(int i = 0; i < contents.size(); i++){
            doc.add(new TextField("text", "" + contents, Field.Store.YES));
            doc.add(new IntField("page", page, Field.Store.YES));
            // }
            // TODO - Check what OpenMode.CREATE_OR_APPEND does; I think updateDocument should
            // always be used with CREATE_OR_APPEND, the if part may need to be removed
            if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                writer.addDocument(doc);
            } else {
                // TODO - Test UpdateDocument
                writer.updateDocument(new Term("id", file.getPath() + ":" + page), doc);
            }
            Log.i(TAG, "Done Indexing file: " + file.getName() + " " + page);
        } catch (Exception e) {
            Log.e(TAG, "Error ", e);
        }
    }
}
From source file: ca.gnewton.lusql.core.IndexTermFreqCache.java
License: Apache License

/**
 * Describe <code>main</code> method here.
 *
 * @param args a <code>String</code> value
 */
public static final void main(final String[] args) {
    String dir = "itfcTestIndex";
    String cachedField = "title";
    try {
        IndexWriterConfig config = new IndexWriterConfig(LuSql.luceneVersion,
                new StandardAnalyzer(LuSql.luceneVersion)).setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        IndexWriter writer = new IndexWriter(FSDirectory.open(new File(dir)), config);

        // Doc #1
        Document doc1 = new Document();
        Field title1 = new org.apache.lucene.document.Field(cachedField, "The Rain in Spain is plain",
                Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES);
        doc1.add(title1);
        org.apache.lucene.document.Field ab1 = new org.apache.lucene.document.Field("ab",
                "This is the test abstract", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES);
        doc1.add(ab1);
        writer.addDocument(doc1);

        // Doc #2
        Document doc2 = new Document();
        Field title2 = new org.apache.lucene.document.Field(cachedField, "This is the test plain title",
                Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES);
        doc2.add(title2);
        org.apache.lucene.document.Field ab2 = new org.apache.lucene.document.Field("ab",
                "This is the test abstract", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES);
        doc2.add(ab2);
        writer.addDocument(doc2);

        writer.close();

        IndexReader reader = IndexReader.open(FSDirectory.open(new File(dir)));
        IndexTermFreqCache cache = new IndexTermFreqCache(reader, cachedField, 100, true);
        System.err.println(cache);
    } catch (Throwable t) {
        t.printStackTrace();
    }
}
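The five-argument Field constructor above (Field.Store / Field.Index / Field.TermVector) is the pre-4.0 Lucene API. A rough modern equivalent, offered as a sketch assuming Lucene 5.x or later (not part of the original source):

import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexOptions;

FieldType analyzedWithVectors = new FieldType();
analyzedWithVectors.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); // Field.Index.ANALYZED
analyzedWithVectors.setTokenized(true);
analyzedWithVectors.setStored(false);          // Field.Store.NO
analyzedWithVectors.setStoreTermVectors(true); // Field.TermVector.YES
analyzedWithVectors.freeze();

Field title1 = new Field("title", "The Rain in Spain is plain", analyzedWithVectors);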
From source file: ca.mcgill.cs.creco.logic.search.CategorySearch.java
License: Apache License

private void buildCategoryIndex() throws IOException {
    IndexWriter writer = new IndexWriter(aDirectory, new IndexWriterConfig(VERSION, aAnalyzer));
    for (Category category : aDataStore.getCategories()) {
        String flattenedText = category.getName();
        for (Product product : category.getProducts()) {
            flattenedText += product.getName() + " ";
        }
        Document doc = new Document();
        doc.add(new TextField(CATEGORY_ID, category.getId(), Field.Store.YES));
        doc.add(new TextField(CATEGORY_NAME, category.getName(), Field.Store.YES));
        doc.add(new TextField(FLATTENED_TEXT, flattenedText, Field.Store.YES));
        writer.addDocument(doc);
    }
    writer.close();
}
From source file: ca.pgon.freenetknowledge.search.impl.LuceneIndexerThread.java
License: Apache License

private void addEntry(IndexWriter indexWriter, Entry entry) {
    Field refererField;
    if (entry.refererURL != null) {
        refererField = new Field(LuceneSearchEngine.INDEX_REFERER_URL,
                String.valueOf(entry.refererURL.getId()), Store.YES, Index.ANALYZED);
    } else {
        refererField = new Field(LuceneSearchEngine.INDEX_REFERER_URL, "null", Store.YES, Index.ANALYZED);
    }
    Field forField = new Field(LuceneSearchEngine.INDEX_FOR_URL, String.valueOf(entry.forURL.getId()),
            Store.YES, Index.NO);
    Field contentField = new Field(LuceneSearchEngine.INDEX_CONTENT, entry.content, Store.YES,
            Index.ANALYZED);

    Document document = new Document();
    document.add(refererField);
    document.add(forField);
    document.add(contentField);

    try {
        indexWriter.addDocument(document);
    } catch (CorruptIndexException e) {
        logger.log(Level.SEVERE, "Description index corrupted", e);
    } catch (IOException e) {
        logger.log(Level.SEVERE, "Description index could not be written", e);
    }
}