Example usage for org.apache.lucene.index IndexWriter addDocument

List of usage examples for org.apache.lucene.index IndexWriter addDocument

Introduction

In this page you can find the example usage for org.apache.lucene.index IndexWriter addDocument.

Prototype

public long addDocument(Iterable<? extends IndexableField> doc) throws IOException 

Source Link

Document

Adds a document to this index.

Usage

From source file:br.andrew.lucene.testing.IndexFiles.java

License: Apache License

/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 * /*from  w  ww  .j  a va2 s. c  om*/
 * NOTE: This method indexes one document per input file.  This is slow.  For good
 * throughput, put multiple documents into your input file(s).  An example of this is
 * in the benchmark module, which can create "line doc" files, one document per line,
 * using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *  
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file The file to index, or the directory to recurse into to find files to index
 * @throws IOException If there is a low-level I/O error
 */
static void indexDocs(final IndexWriter writer, final File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            final String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    IndexFiles.indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {

            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (final FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this exception with an "access denied" message
                // checking if the file can be read doesn't help
                return;
            }

            try {

                // make a new, empty document
                final Document doc = new Document();

                // Add the path of the file as a field named "path".  Use a
                // field that is indexed (i.e. searchable), but don't tokenize 
                // the field into separate words and don't index term frequency
                // or positional information:
                final Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);

                // Add the last modified date of the file a field named "modified".
                // Use a LongField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter).  This indexes to milli-second resolution, which
                // is often too fine.  You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down the resolution you require.
                // For example the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.
                doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

                // Add the contents of the file to a field named "contents".  Specify a Reader,
                // so that the text of the file is tokenized and indexed, but not stored.
                // Note that FileReader expects the file to be in UTF-8 encoding.
                // If that's not the case searching for special characters will fail.
                doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been indexed) so 
                    // we use updateDocument instead to replace the old one matching the exact 
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }

            } finally {
                fis.close();
            }
        }
    }
}

From source file:br.bireme.ngrams.NGrams.java

/**
 * Parses one pipe-delimited document line and indexes it.
 *
 * @param index the index the document belongs to (validated non-null; otherwise unused here)
 * @param writer writer used to add/update the document; committed only on update
 * @param schema schema whose Parameters drive field positions and names
 * @param pipedDoc the raw pipe-delimited record; may be HTML-escaped
 * @param allowDocUpdate when true, replaces any existing document with the same id and commits
 * @return true if a document was written to the index, false if the (trimmed) input
 *         was empty or createDocument returned null
 * @throws IOException if the input is not valid UTF-8, has too few fields, or has an
 *         empty id/dbName, or on a low-level index error
 * @throws ParseException propagated from downstream parsing
 */
public static boolean indexDocument(final NGIndex index, final IndexWriter writer, final NGSchema schema,
        final String pipedDoc, final boolean allowDocUpdate) throws IOException, ParseException {
    // requireNonNull throws NullPointerException with the same message ("index",
    // "writer", ...) as the original hand-rolled checks.
    java.util.Objects.requireNonNull(index, "index");
    java.util.Objects.requireNonNull(writer, "writer");
    java.util.Objects.requireNonNull(schema, "schema");
    java.util.Objects.requireNonNull(pipedDoc, "pipedDoc");

    boolean ret = false;
    final String pipedDocT = pipedDoc.trim();
    if (!isUtf8Encoding(pipedDocT)) {
        throw new IOException("Invalid encoded string");
    }
    if (!pipedDocT.isEmpty()) {
        final Parameters parameters = schema.getParameters();
        // NOTE(review): the field-count check and the split below intentionally use
        // the raw pipedDoc, not the trimmed pipedDocT — preserved as-is.
        if (Tools.countOccurrences(pipedDoc, '|') < parameters.maxIdxFieldPos) {
            throw new IOException("invalid number of fields: [" + pipedDoc + "]");
        }
        // un-escape HTML entities, neutralize ':' and split on '|' keeping trailing
        // empty fields (Integer.MAX_VALUE limit)
        final String pipedDoc2 = StringEscapeUtils.unescapeHtml4(pipedDoc);
        final String[] split = pipedDoc2.replace(':', ' ').trim().split(" *\\| *", Integer.MAX_VALUE);
        final String id = split[parameters.id.pos];
        if (id.isEmpty()) {
            throw new IOException("id");
        }
        final String dbName = split[parameters.db.pos];
        if (dbName.isEmpty()) {
            throw new IOException("dbName");
        }
        final Map<String, br.bireme.ngrams.Field> flds = parameters.nameFields;
        final Document doc = createDocument(flds, split);

        if (doc != null) {
            if (allowDocUpdate) {
                // replace any existing document with the same id, then persist
                writer.updateDocument(new Term("id", id), doc);
                writer.commit();
            } else {
                writer.addDocument(doc);
            }
            ret = true;
        }
    }
    return ret;
}

From source file:br.com.crawlerspring.model.Searcher.java

/**
 * Builds a Lucene document with stored, tokenized "title" and "content"
 * fields and adds it to the given writer.
 */
private void addDoc(IndexWriter writer, String title, String content) throws IOException {
    final org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();
    final TextField titleField = new TextField("title", title, Field.Store.YES);
    final TextField contentField = new TextField("content", content, Field.Store.YES);
    doc.add(titleField);
    doc.add(contentField);
    writer.addDocument(doc);
}

From source file:buscador.IndexFiles.java

License: Apache License

/**
 * Indexes the given file using the given writer, or if a directory is
 * given, recurses over files and directories found under the given
 * directory.
 *
 * NOTE: This method indexes one document per input file. This is slow. For
 * good throughput, put multiple documents into your input file(s). An
 * example of this is in the benchmark module, which can create "line doc"
 * files, one document per line, using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *
 * @param writer Writer to the index where the given file/dir info will be
 * stored
 * @param file The file to index, or the directory to recurse into to find
 * files to index
 * @throws IOException If there is a low-level I/O error
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (!file.canRead()) {
        return;
    }
    if (file.isDirectory()) {
        String[] files = file.list();
        // File.list() returns null when an I/O error occurs; skip the directory then
        if (files != null) {
            for (String child : files) {
                indexDocs(writer, new File(file, child));
            }
        }
        return;
    }

    FileInputStream fis;
    try {
        fis = new FileInputStream(file);
    } catch (FileNotFoundException fnfe) {
        // at least on Windows, some temporary files raise this exception with an
        // "access denied" message; checking if the file can be read doesn't help
        return;
    }

    // try-with-resources replaces the original try/finally close.
    // NOTE(review): the stream is never read below — it only serves as a
    // readability probe, and no "contents" field is ever added to the document.
    // Confirm whether that is intentional.
    try (FileInputStream in = fis) {

        // make a new, empty document
        Document doc = new Document();

        // "path": indexed (searchable) but not tokenized; no term frequency
        // or positional information
        Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
        doc.add(pathField);

        // "modified": last-modified time in milliseconds, stored and efficiently
        // filterable (e.g. NumericRangeFilter). Milli-second resolution is often
        // too fine; a coarser value such as 2011021714 (February 17, 2011, 2-3 PM)
        // can be derived if needed.
        doc.add(new LongField("modified", file.lastModified(), Field.Store.YES));

        // Dublin Core metadata fields extracted from the file
        insertarenIndice(file, "dc:creator", "creator", doc, "text");
        insertarenIndice(file, "dc:title", "title", doc, "text");
        insertarenIndice(file, "dc:description", "description", doc, "text");
        insertarenIndice(file, "dc:identifier", "identifier", doc, "text");
        insertarenIndice(file, "dc:date", "date", doc, "text");

        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // New index, so we just add the document (no old document can be there):
            System.out.println("adding " + file);
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been indexed) so
            // we use updateDocument instead to replace the old one matching the exact
            // path, if present:
            System.out.println("updating " + file);
            writer.updateDocument(new Term("path", file.getPath()), doc);
        }
    }
}

From source file:byrne.mitre.main.NameMatcher.java

License: Apache License

/**
 * Reads one name entry per line from the given file and adds each as a
 * Lucene document with "id", "name" and "ngrams" fields.
 *
 * @param filename path of the input file, one NameEntry record per line
 * @param writer index writer the documents are added to (not closed here)
 * @throws IOException if the file cannot be read or a document cannot be added
 */
private static void loadIndex(String filename, IndexWriter writer) throws IOException {
    // try-with-resources fixes a leak in the original: the reader was not
    // closed when readLine()/addDocument() threw.
    // NOTE(review): FileReader decodes with the platform default charset —
    // confirm the input file's encoding matches.
    try (BufferedReader bufferedReader = new BufferedReader(new FileReader(filename))) {
        String line;
        while ((line = bufferedReader.readLine()) != null) {
            NameEntry entry = new NameEntry(line);
            Document doc = new Document();
            doc.add(new Field("id", entry.getID(), Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.add(new Field("name", entry.getFullName(), Field.Store.YES, Field.Index.NOT_ANALYZED));
            // term vectors enabled so n-gram similarity can be computed later
            doc.add(new Field("ngrams", new StringReader(entry.getFullName()), Field.TermVector.YES));
            writer.addDocument(doc);
        }
    }
}

From source file:bzh.terrevirtuelle.navisu.gazetteer.impl.lucene.GeoNameResolver.java

License: Apache License

/**
 * Index gazetteer's one line data by built-in Lucene Index functions.
 *
 * @param indexWriter Lucene indexWriter to be loaded
 * @param line a tab-separated line from the gazetteer file (geonames.org column layout)
 * @param reverseGeocodingEnabled when true, also index the lon/lat point for spatial queries
 * @throws NumberFormatException if the ID column (tokens[0]) is not a valid integer
 */
private void addDoc(IndexWriter indexWriter, final String line, final boolean reverseGeocodingEnabled) {
    String[] tokens = line.split("\t");

    // NOTE(review): unlike the numeric columns below, a malformed ID is not
    // caught and will propagate as NumberFormatException.
    int ID = Integer.parseInt(tokens[0]);
    String name = tokens[1];
    String alternatenames = tokens[3];

    // primitive double/int instead of boxed wrappers; the old initializer
    // (-999999.0) was dead — it was always overwritten in the try or the catch
    double latitude;
    try {
        latitude = Double.parseDouble(tokens[4]);
    } catch (NumberFormatException e) {
        latitude = OUT_OF_BOUNDS;
    }
    double longitude;
    try {
        longitude = Double.parseDouble(tokens[5]);
    } catch (NumberFormatException e) {
        longitude = OUT_OF_BOUNDS;
    }

    int population;
    try {
        population = Integer.parseInt(tokens[14]);
    } catch (NumberFormatException e) {
        population = 0; // treat as population does not exist
    }

    // Additional fields to rank more known locations higher.
    // All available codes can be viewed on www.geonames.org
    String featureCode = tokens[7]; // more granular category
    String countryCode = tokens[8];
    String admin1Code = tokens[10]; // eg US State
    String admin2Code = tokens[11]; // eg county

    Document doc = new Document();
    doc.add(new IntField(FIELD_NAME_ID, ID, Field.Store.YES));
    doc.add(new TextField(FIELD_NAME_NAME, name, Field.Store.YES));
    doc.add(new DoubleField(FIELD_NAME_LONGITUDE, longitude, Field.Store.YES));
    doc.add(new DoubleField(FIELD_NAME_LATITUDE, latitude, Field.Store.YES));
    doc.add(new TextField(FIELD_NAME_ALTERNATE_NAMES, alternatenames, Field.Store.YES));
    doc.add(new TextField(FIELD_NAME_FEATURE_CODE, featureCode, Field.Store.YES));
    doc.add(new TextField(FIELD_NAME_COUNTRY_CODE, countryCode, Field.Store.YES));
    doc.add(new TextField(FIELD_NAME_ADMIN1_CODE, admin1Code, Field.Store.YES));
    doc.add(new TextField(FIELD_NAME_ADMIN2_CODE, admin2Code, Field.Store.YES));
    doc.add(new NumericDocValuesField(FIELD_NAME_POPULATION, population)); // sort-enabled field

    if (reverseGeocodingEnabled) {
        // index the point so spatial (reverse geocoding) queries can match it
        Point point = ctx.makePoint(longitude, latitude);
        for (IndexableField f : strategy.createIndexableFields(point)) {
            doc.add(f);
        }
    }

    try {
        indexWriter.addDocument(doc);
    } catch (IOException e) {
        // NOTE(review): the failure is only printed and the line silently
        // dropped — consider propagating or using a real logger.
        e.printStackTrace();
    }
}

From source file:ca.dracode.ais.indexer.FileIndexer.java

License: Open Source License

/**
 * Creates a Document containing contents and metadata for a specific page of a file
 * and adds (or updates) it in the index.
 *
 * @param writer The writer used to save the metadata
 * @param file The file that the page belongs to
 * @param page The index of the page in the file
 * @param contents The string contents of the file
 */
public static void Build(IndexWriter writer, File file, int page, String contents) {
    // unreadable files are skipped entirely
    if (!file.canRead()) {
        return;
    }
    try {
        // the per-page identity used both for the "id" field and the update Term
        final String pageId = file.getPath() + ":" + page;

        final Document document = new Document();
        document.add(new StringField("id", pageId, Field.Store.NO));
        document.add(new StringField("path", file.getPath(), Field.Store.YES));
        document.add(new LongField("modified", file.lastModified(), Field.Store.YES));
        document.add(new TextField("text", "" + contents, Field.Store.YES));
        document.add(new IntField("page", page, Field.Store.YES));

        // TODO - Check what OpenMode.CREATE_OR_APPEND does; I think updateDocument should
        // always be used with CREATE_OR_APPEND, the if part may need to be removed
        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            document и= null; // placeholder removed
        }
    } catch (Exception e) {
        Log.e(TAG, "Error ", e);
    }
}

From source file:ca.gnewton.lusql.core.IndexTermFreqCache.java

License: Apache License

/**
 * Builds a tiny two-document test index in "itfcTestIndex", then loads an
 * IndexTermFreqCache over the "title" field and prints it to stderr.
 *
 * @param args a <code>String</code> value (unused)
 */
public static final void main(final String[] args) {
    String dir = "itfcTestIndex";
    String cachedField = "title";
    try {
        IndexWriterConfig config = new IndexWriterConfig(LuSql.luceneVersion,
                new StandardAnalyzer(LuSql.luceneVersion)).setOpenMode(IndexWriterConfig.OpenMode.CREATE);

        // try-with-resources guarantees the writer is closed (committing both
        // documents) even if an addDocument call throws; the original leaked
        // the writer on any exception
        try (IndexWriter writer = new IndexWriter(FSDirectory.open(new File(dir)), config)) {

            // Doc #1: analyzed title + abstract, both with term vectors
            Document doc1 = new Document();
            Field title1 = new org.apache.lucene.document.Field(cachedField, "The Rain in Spain is plain",
                    Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES);
            doc1.add(title1);
            org.apache.lucene.document.Field ab1 = new org.apache.lucene.document.Field("ab",
                    "This is the test abstract", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES);
            doc1.add(ab1);
            writer.addDocument(doc1);

            // Doc #2: same shape, different title text
            Document doc2 = new Document();
            Field title2 = new org.apache.lucene.document.Field(cachedField, "This is the test plain title",
                    Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES);
            doc2.add(title2);
            org.apache.lucene.document.Field ab2 = new org.apache.lucene.document.Field("ab",
                    "This is the test abstract", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES);
            doc2.add(ab2);
            writer.addDocument(doc2);
        }

        // the reader was previously never closed — close it as well
        try (IndexReader reader = IndexReader.open(FSDirectory.open(new File(dir)))) {
            IndexTermFreqCache cache = new IndexTermFreqCache(reader, cachedField, 100, true);
            System.err.println(cache);
        }
    } catch (Throwable t) {
        t.printStackTrace();
    }

}

From source file:ca.mcgill.cs.creco.logic.search.CategorySearch.java

License: Apache License

/**
 * Builds the category index: one document per category holding its id, its
 * name, and a flattened-text field of the category name followed by all of
 * its product names.
 *
 * @throws IOException if the index cannot be written
 */
private void buildCategoryIndex() throws IOException {
    // try-with-resources fixes a leak: the writer was never closed when
    // addDocument threw
    try (IndexWriter writer = new IndexWriter(aDirectory, new IndexWriterConfig(VERSION, aAnalyzer))) {
        for (Category category : aDataStore.getCategories()) {
            // StringBuilder avoids O(n^2) string concatenation over products;
            // the resulting text is identical: name, then "<product> " repeated
            StringBuilder flattenedText = new StringBuilder(category.getName());
            for (Product product : category.getProducts()) {
                flattenedText.append(product.getName()).append(' ');
            }
            Document doc = new Document();
            doc.add(new TextField(CATEGORY_ID, category.getId(), Field.Store.YES));
            doc.add(new TextField(CATEGORY_NAME, category.getName(), Field.Store.YES));
            doc.add(new TextField(FLATTENED_TEXT, flattenedText.toString(), Field.Store.YES));
            writer.addDocument(doc);
        }
    }
}

From source file:ca.pgon.freenetknowledge.search.impl.LuceneIndexerThread.java

License: Apache License

/**
 * Converts a crawler entry into a Lucene document (referer URL id, target
 * URL id, content) and adds it to the given writer, logging any failure.
 */
private void addEntry(IndexWriter indexWriter, Entry entry) {
    // the referer is optional; the literal string "null" stands in when absent
    final String refererValue = (entry.refererURL != null)
            ? String.valueOf(entry.refererURL.getId())
            : "null";

    final Field refererField = new Field(LuceneSearchEngine.INDEX_REFERER_URL, refererValue,
            Store.YES, Index.ANALYZED);
    final Field forField = new Field(LuceneSearchEngine.INDEX_FOR_URL, String.valueOf(entry.forURL.getId()),
            Store.YES, Index.NO);
    final Field contentField = new Field(LuceneSearchEngine.INDEX_CONTENT, entry.content, Store.YES,
            Index.ANALYZED);

    final Document document = new Document();
    document.add(refererField);
    document.add(forField);
    document.add(contentField);

    try {
        indexWriter.addDocument(document);
    } catch (CorruptIndexException e) {
        logger.log(Level.SEVERE, "Description index corrupted", e);
    } catch (IOException e) {
        logger.log(Level.SEVERE, "Description index could not be written", e);
    }
}