List of usage examples for org.apache.lucene.index.IndexWriter.addDocument
public long addDocument(Iterable<? extends IndexableField> doc) throws IOException
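Before the source-file examples, here is a minimal, self-contained sketch of the call. It assumes a recent Lucene release (7.x or later, where addDocument returns a sequence number); the index path and the "contents" field name are illustrative, not taken from any example below.

import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class AddDocumentSketch {
    public static void main(String[] args) throws IOException {
        // Open (or create) an on-disk index; the path is illustrative.
        try (Directory dir = FSDirectory.open(Paths.get("/tmp/example-index"));
                IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            Document doc = new Document();
            doc.add(new TextField("contents", "hello lucene", Field.Store.YES));
            // addDocument buffers the document in RAM; it becomes searchable after
            // commit() or through a near-real-time reader. In Lucene 7+ the return
            // value is a sequence number identifying this operation.
            long seqNo = writer.addDocument(doc);
            writer.commit();
        }
    }
}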
From source file: br.andrew.lucene.testing.IndexFiles.java
License: Apache License

/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 *
 * NOTE: This method indexes one document per input file. This is slow. For good
 * throughput, put multiple documents into your input file(s). An example of this is
 * in the benchmark module, which can create "line doc" files, one document per line,
 * using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file The file to index, or the directory to recurse into to find files to index
 * @throws IOException If there is a low-level I/O error
 */
static void indexDocs(final IndexWriter writer, final File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            final String[] files = file.list(); // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    IndexFiles.indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (final FileNotFoundException fnfe) {
                // At least on Windows, some temporary files raise this exception with an
                // "access denied" message; checking whether the file can be read doesn't help.
                return;
            }
            try {
                // make a new, empty document
                final Document doc = new Document();

                // Add the path of the file as a field named "path". Use a
                // field that is indexed (i.e. searchable), but don't tokenize
                // the field into separate words and don't index term frequency
                // or positional information:
                final Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);

                // Add the last modified date of the file as a field named "modified".
                // Use a LongField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter). This indexes to millisecond resolution, which
                // is often too fine. You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down to the resolution you require.
                // For example the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.
                doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

                // Add the contents of the file to a field named "contents". Specify a Reader,
                // so that the text of the file is tokenized and indexed, but not stored.
                // Note that the reader decodes the file as UTF-8; if that is not the file's
                // actual encoding, searching for special characters will fail.
                doc.add(new TextField("contents",
                        new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been indexed), so
                    // we use updateDocument instead to replace the old one matching the exact
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }
            } finally {
                fis.close();
            }
        }
    }
}
From source file: br.bireme.ngrams.NGrams.java

public static boolean indexDocument(final NGIndex index, final IndexWriter writer, final NGSchema schema,
        final String pipedDoc, final boolean allowDocUpdate) throws IOException, ParseException {
    if (index == null) {
        throw new NullPointerException("index");
    }
    if (writer == null) {
        throw new NullPointerException("writer");
    }
    if (schema == null) {
        throw new NullPointerException("schema");
    }
    if (pipedDoc == null) {
        throw new NullPointerException("pipedDoc");
    }
    boolean ret = false;
    final String pipedDocT = pipedDoc.trim();
    if (!isUtf8Encoding(pipedDocT)) {
        throw new IOException("Invalid encoded string");
    }
    if (!pipedDocT.isEmpty()) {
        final Parameters parameters = schema.getParameters();
        if (Tools.countOccurrences(pipedDoc, '|') < parameters.maxIdxFieldPos) {
            throw new IOException("invalid number of fields: [" + pipedDoc + "]");
        }
        final String pipedDoc2 = StringEscapeUtils.unescapeHtml4(pipedDoc);
        final String[] split = pipedDoc2.replace(':', ' ').trim().split(" *\\| *", Integer.MAX_VALUE);
        final String id = split[parameters.id.pos];
        if (id.isEmpty()) {
            throw new IOException("id");
        }
        final String dbName = split[parameters.db.pos];
        if (dbName.isEmpty()) {
            throw new IOException("dbName");
        }
        final Map<String, br.bireme.ngrams.Field> flds = parameters.nameFields;
        final Document doc = createDocument(flds, split);
        if (doc != null) {
            if (allowDocUpdate) {
                writer.updateDocument(new Term("id", id), doc);
                writer.commit();
            } else {
                writer.addDocument(doc);
            }
            ret = true;
        }
    }
    return ret;
}
From source file: br.com.crawlerspring.model.Searcher.java

private void addDoc(IndexWriter writer, String title, String content) throws IOException {
    org.apache.lucene.document.Document luceneDocument = new org.apache.lucene.document.Document();
    luceneDocument.add(new TextField("title", title, Field.Store.YES));
    luceneDocument.add(new TextField("content", content, Field.Store.YES));
    writer.addDocument(luceneDocument);
}
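A brief usage sketch for this helper (the directory path, analyzer, and sample strings are illustrative; Searcher's actual writer setup is not shown in the source):

Directory directory = FSDirectory.open(Paths.get("/tmp/crawler-index"));
IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new StandardAnalyzer()));
addDoc(writer, "Lucene in Action", "Lucene is a full-text search library for Java.");
writer.close();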
From source file: buscador.IndexFiles.java
License: Apache License

/**
 * Indexes the given file using the given writer, or if a directory is
 * given, recurses over files and directories found under the given
 * directory.
 *
 * NOTE: This method indexes one document per input file. This is slow. For
 * good throughput, put multiple documents into your input file(s). An
 * example of this is in the benchmark module, which can create "line doc"
 * files, one document per line, using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file The file to index, or the directory to recurse into to find files to index
 * @throws IOException If there is a low-level I/O error
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list(); // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // At least on Windows, some temporary files raise this exception with an
                // "access denied" message; checking whether the file can be read doesn't help.
                return;
            }
            try {
                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path". Use a
                // field that is indexed (i.e. searchable), but don't tokenize
                // the field into separate words and don't index term frequency
                // or positional information:
                Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);

                // Add the last modified date of the file as a field named "modified".
                // Use a LongField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter). This indexes to millisecond resolution, which
                // is often too fine. You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down to the resolution you require.
                // For example the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.
                doc.add(new LongField("modified", file.lastModified(), Field.Store.YES));

                insertarenIndice(file, "dc:creator", "creator", doc, "text");
                insertarenIndice(file, "dc:title", "title", doc, "text");
                insertarenIndice(file, "dc:description", "description", doc, "text");
                insertarenIndice(file, "dc:identifier", "identifier", doc, "text");
                insertarenIndice(file, "dc:date", "date", doc, "text");

                // Add the contents of the file to a field named "contents". Specify a Reader,
                // so that the text of the file is tokenized and indexed, but not stored.
                // Note that FileReader expects the file to be in UTF-8 encoding.
                // If that's not the case, searching for special characters will fail.
                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been indexed), so
                    // we use updateDocument instead to replace the old one matching the exact
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }
            } finally {
                fis.close();
            }
        }
    }
}
From source file: byrne.mitre.main.NameMatcher.java
License: Apache License

private static void loadIndex(String filename, IndexWriter writer) throws IOException {
    BufferedReader bufferedReader = new BufferedReader(new FileReader(filename));
    String line = null;
    while ((line = bufferedReader.readLine()) != null) {
        NameEntry entry = new NameEntry(line);
        Document doc = new Document();
        doc.add(new Field("id", entry.getID(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("name", entry.getFullName(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("ngrams", new StringReader(entry.getFullName()), Field.TermVector.YES));
        writer.addDocument(doc);
    }
    bufferedReader.close();
}
From source file: bzh.terrevirtuelle.navisu.gazetteer.impl.lucene.GeoNameResolver.java
License: Apache License

/**
 * Indexes one line of gazetteer data using built-in Lucene index functions.
 *
 * @param indexWriter Lucene indexWriter to be loaded
 * @param line a line from the gazetteer file
 * @throws IOException
 * @throws NumberFormatException
 */
private void addDoc(IndexWriter indexWriter, final String line, final boolean reverseGeocodingEnabled) {
    String[] tokens = line.split("\t");
    int ID = Integer.parseInt(tokens[0]);
    String name = tokens[1];
    String alternatenames = tokens[3];

    Double latitude = -999999.0;
    try {
        latitude = Double.parseDouble(tokens[4]);
    } catch (NumberFormatException e) {
        latitude = OUT_OF_BOUNDS;
    }
    Double longitude = -999999.0;
    try {
        longitude = Double.parseDouble(tokens[5]);
    } catch (NumberFormatException e) {
        longitude = OUT_OF_BOUNDS;
    }

    int population = 0;
    try {
        population = Integer.parseInt(tokens[14]);
    } catch (NumberFormatException e) {
        population = 0; // treat as if the population does not exist
    }

    // Additional fields to rank better-known locations higher.
    // All available codes can be viewed on www.geonames.org
    String featureCode = tokens[7];  // more granular category
    String countryCode = tokens[8];
    String admin1Code = tokens[10];  // e.g. US state
    String admin2Code = tokens[11];  // e.g. county

    Document doc = new Document();
    doc.add(new IntField(FIELD_NAME_ID, ID, Field.Store.YES));
    doc.add(new TextField(FIELD_NAME_NAME, name, Field.Store.YES));
    doc.add(new DoubleField(FIELD_NAME_LONGITUDE, longitude, Field.Store.YES));
    doc.add(new DoubleField(FIELD_NAME_LATITUDE, latitude, Field.Store.YES));
    doc.add(new TextField(FIELD_NAME_ALTERNATE_NAMES, alternatenames, Field.Store.YES));
    doc.add(new TextField(FIELD_NAME_FEATURE_CODE, featureCode, Field.Store.YES));
    doc.add(new TextField(FIELD_NAME_COUNTRY_CODE, countryCode, Field.Store.YES));
    doc.add(new TextField(FIELD_NAME_ADMIN1_CODE, admin1Code, Field.Store.YES));
    doc.add(new TextField(FIELD_NAME_ADMIN2_CODE, admin2Code, Field.Store.YES));
    doc.add(new NumericDocValuesField(FIELD_NAME_POPULATION, population)); // sort-enabled field

    if (reverseGeocodingEnabled) {
        Point point = ctx.makePoint(longitude, latitude);
        for (IndexableField f : strategy.createIndexableFields(point)) {
            doc.add(f);
        }
    }

    try {
        indexWriter.addDocument(doc);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
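The ctx and strategy members used for reverse geocoding are not shown in this snippet. A plausible initialization, offered only as a sketch of the common spatial4j + lucene-spatial pattern (the "geoField" name and tree depth are assumptions; in older releases SpatialContext lives in com.spatial4j.core.context rather than org.locationtech.spatial4j.context):

import org.apache.lucene.spatial.SpatialStrategy;
import org.apache.lucene.spatial.prefix.RecursivePrefixTreeStrategy;
import org.apache.lucene.spatial.prefix.tree.GeohashPrefixTree;
import org.locationtech.spatial4j.context.SpatialContext;

// Hypothetical field declarations, not from the source:
private final SpatialContext ctx = SpatialContext.GEO;
private final SpatialStrategy strategy =
        new RecursivePrefixTreeStrategy(new GeohashPrefixTree(SpatialContext.GEO, 11), "geoField");
        // geohash length 11 gives roughly meter-scale grid cells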
From source file: ca.dracode.ais.indexer.FileIndexer.java
License: Open Source License

/**
 * Creates a Document containing contents and metadata for a specific page of a file
 *
 * @param writer The writer used to save the metadata
 * @param file The file that the page belongs to
 * @param page The index of the page in the file
 * @param contents The string contents of the file
 */
public static void Build(IndexWriter writer, File file, int page, String contents) {
    if (file.canRead()) {
        try {
            //Log.i(TAG, "Started Indexing file: " + file.getName() + " " + page);
            Document doc = new Document();
            doc.add(new StringField("id", file.getPath() + ":" + page, Field.Store.NO));
            doc.add(new StringField("path", file.getPath(), Field.Store.YES));
            doc.add(new LongField("modified", file.lastModified(), Field.Store.YES));
            // for(int i = 0; i < contents.size(); i++){
            doc.add(new TextField("text", "" + contents, Field.Store.YES));
            doc.add(new IntField("page", page, Field.Store.YES));
            // }
            // TODO - Check what OpenMode.CREATE_OR_APPEND does; I think updateDocument should
            // always be used with CREATE_OR_APPEND, the if part may need to be removed
            if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                writer.addDocument(doc);
            } else {
                // TODO - Test UpdateDocument
                writer.updateDocument(new Term("id", file.getPath() + ":" + page), doc);
            }
            Log.i(TAG, "Done Indexing file: " + file.getName() + " " + page);
        } catch (Exception e) {
            Log.e(TAG, "Error ", e);
        }
    }
}
From source file: ca.gnewton.lusql.core.IndexTermFreqCache.java
License: Apache License

/**
 * Describe <code>main</code> method here.
 *
 * @param args a <code>String</code> value
 */
public static final void main(final String[] args) {
    String dir = "itfcTestIndex";
    String cachedField = "title";
    try {
        IndexWriterConfig config = new IndexWriterConfig(LuSql.luceneVersion,
                new StandardAnalyzer(LuSql.luceneVersion)).setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        IndexWriter writer = new IndexWriter(FSDirectory.open(new File(dir)), config);

        // Doc #1
        Document doc1 = new Document();
        Field title1 = new org.apache.lucene.document.Field(cachedField, "The Rain in Spain is plain",
                Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES);
        doc1.add(title1);
        org.apache.lucene.document.Field ab1 = new org.apache.lucene.document.Field("ab",
                "This is the test abstract", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES);
        doc1.add(ab1);
        writer.addDocument(doc1);

        // Doc #2
        Document doc2 = new Document();
        Field title2 = new org.apache.lucene.document.Field(cachedField, "This is the test plain title",
                Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES);
        doc2.add(title2);
        org.apache.lucene.document.Field ab2 = new org.apache.lucene.document.Field("ab",
                "This is the test abstract", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES);
        doc2.add(ab2);
        writer.addDocument(doc2);

        writer.close();

        IndexReader reader = IndexReader.open(FSDirectory.open(new File(dir)));
        IndexTermFreqCache cache = new IndexTermFreqCache(reader, cachedField, 100, true);
        System.err.println(cache);
    } catch (Throwable t) {
        t.printStackTrace();
    }
}
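The five-argument Field constructor above (Field.Store / Field.Index / Field.TermVector) is the pre-4.0 Lucene API. A rough modern equivalent, offered as a sketch assuming Lucene 5.x or later (not part of the original source):

import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexOptions;

FieldType analyzedWithVectors = new FieldType();
analyzedWithVectors.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); // Field.Index.ANALYZED
analyzedWithVectors.setTokenized(true);
analyzedWithVectors.setStored(false);          // Field.Store.NO
analyzedWithVectors.setStoreTermVectors(true); // Field.TermVector.YES
analyzedWithVectors.freeze();

Field title1 = new Field("title", "The Rain in Spain is plain", analyzedWithVectors);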
From source file: ca.mcgill.cs.creco.logic.search.CategorySearch.java
License: Apache License

private void buildCategoryIndex() throws IOException {
    IndexWriter writer = new IndexWriter(aDirectory, new IndexWriterConfig(VERSION, aAnalyzer));
    for (Category category : aDataStore.getCategories()) {
        String flattenedText = category.getName();
        for (Product product : category.getProducts()) {
            flattenedText += product.getName() + " ";
        }
        Document doc = new Document();
        doc.add(new TextField(CATEGORY_ID, category.getId(), Field.Store.YES));
        doc.add(new TextField(CATEGORY_NAME, category.getName(), Field.Store.YES));
        doc.add(new TextField(FLATTENED_TEXT, flattenedText, Field.Store.YES));
        writer.addDocument(doc);
    }
    writer.close();
}
From source file: ca.pgon.freenetknowledge.search.impl.LuceneIndexerThread.java
License: Apache License

private void addEntry(IndexWriter indexWriter, Entry entry) {
    Field refererField;
    if (entry.refererURL != null) {
        refererField = new Field(LuceneSearchEngine.INDEX_REFERER_URL,
                String.valueOf(entry.refererURL.getId()), Store.YES, Index.ANALYZED);
    } else {
        refererField = new Field(LuceneSearchEngine.INDEX_REFERER_URL, "null", Store.YES, Index.ANALYZED);
    }
    Field forField = new Field(LuceneSearchEngine.INDEX_FOR_URL, String.valueOf(entry.forURL.getId()),
            Store.YES, Index.NO);
    Field contentField = new Field(LuceneSearchEngine.INDEX_CONTENT, entry.content, Store.YES,
            Index.ANALYZED);

    Document document = new Document();
    document.add(refererField);
    document.add(forField);
    document.add(contentField);

    try {
        indexWriter.addDocument(document);
    } catch (CorruptIndexException e) {
        logger.log(Level.SEVERE, "Description index corrupted", e);
    } catch (IOException e) {
        logger.log(Level.SEVERE, "Description index could not be written", e);
    }
}