List of usage examples for org.apache.lucene.index.IndexWriter.addDocument
public long addDocument(Iterable<? extends IndexableField> doc) throws IOException
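Before the per-project sources below, here is a minimal, self-contained sketch of the call in isolation. It assumes a recent Lucene release (6.2 or later, where addDocument returns the operation's sequence number rather than void); the index path and field names are illustrative only and are not taken from any of the sources.

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class AddDocumentExample {
    public static void main(String[] args) throws IOException {
        // Open (or create) a file-system index; "example-index" is an illustrative path.
        try (Directory dir = FSDirectory.open(Paths.get("example-index"));
             IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {

            // A Document is an Iterable<IndexableField>, which is exactly what addDocument accepts.
            Document doc = new Document();
            doc.add(new StringField("id", "doc-1", Field.Store.YES));           // indexed as-is, stored
            doc.add(new TextField("contents", "hello lucene", Field.Store.NO)); // tokenized, not stored

            // In Lucene 6.2+ addDocument returns the sequence number of the indexing operation.
            long seqNo = writer.addDocument(doc);
            System.out.println("added document, sequence number = " + seqNo);

            writer.commit();
        }
    }
}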
From source file:com.edu.lucene.IndexFiles.java
License:Apache License
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            System.out.println("adding " + file);
            try {
                writer.addDocument(FileDocument.Document(file));
            }
            // at least on windows, some temporary files raise this exception with an "access denied" message
            // checking if the file can be read doesn't help
            catch (FileNotFoundException fnfe) {
                ;
            }
        }
    }
}
From source file:com.ekinoks.lucene.introduction.demos.IndexFiles.java
License:Apache License
/**
 * Indexes the given file using the given writer, or if a directory is
 * given, recurses over files and directories found under the given
 * directory.
 *
 * NOTE: This method indexes one document per input file. This is slow. For
 * good throughput, put multiple documents into your input file(s). An
 * example of this is in the benchmark module, which can create "line doc"
 * files, one document per line, using the <a href=
 * "../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *
 * @param writer
 *            Writer to the index where the given file/dir info will be
 *            stored
 * @param file
 *            The file to index, or the directory to recurse into to find
 *            files to index
 * @throws IOException
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this
                // exception with an "access denied" message
                // checking if the file can be read doesn't help
                return;
            }
            try {
                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path". Use a
                // field that is indexed (i.e. searchable), but don't tokenize
                // the field into separate words and don't index term frequency
                // or positional information:
                Field pathField = new Field("path", file.getPath(), Field.Store.YES,
                        Field.Index.NOT_ANALYZED_NO_NORMS);
                pathField.setOmitTermFreqAndPositions(true);
                doc.add(pathField);

                // Add the last modified date of the file as a field named "modified".
                // Use a NumericField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter). This indexes to milli-second resolution, which
                // is often too fine. You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down to the resolution you require.
                // For example the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.
                NumericField modifiedField = new NumericField("modified");
                modifiedField.setLongValue(file.lastModified());
                doc.add(modifiedField);

                // Add the contents of the file to a field named "contents". Specify a Reader,
                // so that the text of the file is tokenized and indexed, but not stored.
                // Note that FileReader expects the file to be in UTF-8 encoding.
                // If that's not the case searching for special characters will fail.
                doc.add(new Field("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been indexed) so
                    // we use updateDocument instead to replace the old one matching the exact
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }
            } finally {
                fis.close();
            }
        }
    }
}
From source file:com.emental.mindraider.core.search.SearchCommander.java
License:Apache License
/**
 * Index documents.
 *
 * @param writer
 *            the index writer
 * @param file
 *            the file to write
 * @param rebuildSearchIndexJDialog
 *            the rebuild search index JDialog
 * @throws IOException
 *             the I/O exception
 */
public static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (String filename : files) {
                    indexDocs(writer, new File(file, filename));
                }
            }
        } else {
            StatusBar.setText(action, file.getAbsolutePath(), 70);
            try {
                // I'm interested only in indexing of concepts
                if (file.getAbsolutePath()
                        .indexOf(File.separator + CONCEPTS_DIRECTORY_NAME + File.separator) >= 0) {
                    ConceptResource conceptResource = new ConceptResource(new Resource(file.getAbsolutePath()));

                    // FTS index
                    // TODO parse notebook label from the path for now
                    String notebookUri = conceptResource.getNotebookUri();
                    String notebookLabel;
                    if (notebookUri != null && (notebookUri.indexOf("#") >= 0)) {
                        // TODO from time to time the last letter is killed
                        notebookLabel = notebookUri.substring(notebookUri.indexOf("#") + 1, notebookUri.length());
                        // TODO ugly hack - label must be loaded from the model (slow)
                        notebookLabel = notebookLabel.replaceAll("_", " ");
                    } else {
                        notebookLabel = "Notebook";
                    }

                    // tag (infiltrated)
                    CategoryProperty[] tagsAndFlag = conceptResource.getCategories();
                    if (tagsAndFlag != null && tagsAndFlag.length > 0) {
                        for (CategoryProperty tagOrFlag : tagsAndFlag) {
                            // only tags (not the flag!) are indexed
                            if (tagOrFlag.getCategoryValue() != null
                                    && tagOrFlag.getCategoryValue().length() > 0) {
                                if (!tagOrFlag.getCategoryValue()
                                        .startsWith(MindRaiderConstants.MR_OWL_FLAG_NS)) {
                                    MindRaider.tagCustodian.addOrInc(
                                            new TagEntryImpl(tagOrFlag.getCategoryValue(),
                                                    tagOrFlag.getCategoryCaption(), 1),
                                            new TaggedResourceEntry(notebookUri, notebookLabel,
                                                    conceptResource.getUri(), conceptResource.getLabel(),
                                                    conceptResource.resource.getMetadata().getTimestamp(),
                                                    file.getAbsoluteFile().getAbsolutePath()));
                                }
                            }
                        }
                    }

                    // write it to index
                    writer.addDocument(FileDocument.Document(file, notebookLabel, conceptResource.getLabel(),
                            conceptResource.getUri()));
                }
            } catch (EOFException e) {
                logger.debug("Unable to read file " + file.getAbsolutePath(), e);
            }
            // at least on windows, some temporary files raise this
            // exception with an "access denied" message
            // checking if the file can be read doesn't help
            catch (Exception e) {
                logger.debug("File not found!", e);
            }
        }
    }
}
From source file:com.emental.mindraider.core.search.SearchCommander.java
License:Apache License
public static void updateIndex(File conceptFile, String notebookLabel, String conceptLabel, String conceptUri) {
    // TODO the body of this method to be run in an asynchronous thread
    IndexWriter writer;
    try {
        writer = new IndexWriter(getSearchIndexPath(), new StandardAnalyzer(), false);

        // update document via concept URI
        logger.debug("UPDATing FTS index for concept: " + conceptFile + " # " + notebookLabel + " # "
                + conceptLabel + " # " + conceptUri); // {{debug}}
        Document document = FileDocument.Document(conceptFile, notebookLabel, conceptLabel, conceptUri);
        writer.deleteDocuments(new Term("uri", conceptUri));
        writer.addDocument(document);

        // TODO removed just for now (before it will be done in async)
        //writer.optimize();
        writer.close();
    } catch (Exception e) {
        logger.debug("Unable to update FTS index", e); // {{debug}}
        // TODO close it in finally
    }
}
From source file:com.epimorphics.server.indexers.LuceneIndex.java
License:Apache License
private void indexEntity(IndexWriter iwriter, boolean update, String graphname, Resource entity)
        throws IOException {
    if (entity.isAnon())
        return;
    Document doc = new Document();
    doc.add(new StringField(FIELD_URI, entity.getURI(), Field.Store.YES));
    doc.add(new StringField(FIELD_GRAPH, graphname, Field.Store.YES));
    StmtIterator si = entity.listProperties();
    while (si.hasNext()) {
        Statement s = si.next();
        Property p = s.getPredicate();
        RDFNode value = s.getObject();
        String valueStr = asString(value);
        if (labelProps.contains(p)) {
            doc.add(new TextField(p.getURI(), valueStr, Field.Store.YES));
            doc.add(new TextField(FIELD_LABEL, valueStr, Field.Store.NO));
        } else if (labelOnlyProps.contains(p)) {
            doc.add(new TextField(p.getURI(), valueStr, Field.Store.NO));
            doc.add(new TextField(FIELD_LABEL, valueStr, Field.Store.NO));
        } else if (valueProps.contains(p) || (indexAll && !ignoreProps.contains(p))) {
            if (value.isURIResource()) {
                doc.add(new StringField(p.getURI(), value.asResource().getURI(), Field.Store.YES));
                // Alternative below would share storage of URIs but only allows per document field
                // doc.add( new DerefBytesDocValuesField(p.getURI(), new BytesRef(value.asResource().getURI())) );
            } else if (value.isLiteral()) {
                Literal lvalue = value.asLiteral();
                Object jvalue = lvalue.getValue();
                if (jvalue instanceof Long || jvalue instanceof Integer) {
                    doc.add(new LongField(p.getURI(), ((Number) jvalue).longValue(), Field.Store.YES));
                } else {
                    doc.add(new TextField(p.getURI(), valueStr, Field.Store.YES));
                }
            }
        }
    }
    if (update) {
        iwriter.updateDocument(new Term(FIELD_URI, entity.getURI()), doc);
    } else {
        iwriter.addDocument(doc);
    }
}
From source file:com.example.analyzer.server.database.DbFullTextIndex.java
License:Open Source License
public DbFullTextIndex(DbTable dbTable, int columnOffset) {
    try {
        long beginTime = System.currentTimeMillis();
        ramDirectory = new RAMDirectory();
        IndexWriter writer = new IndexWriter(ramDirectory, new StandardAnalyzer(Version.LUCENE_30),
                new MaxFieldLength(50));
        int rowCount = dbTable.getRowCount();
        for (int rowOffset = 0; rowOffset < rowCount; rowOffset++) {
            String value = dbTable.coalesce(rowOffset, columnOffset, "").toString();
            byte[] idArray = getBytes(rowOffset);
            Document document = new Document();
            document.add(new Field(ID, idArray, Field.Store.YES));
            // TODO: Determine whether we need to store value
            document.add(new Field(VALUE, value, Store.YES, Index.ANALYZED));
            writer.addDocument(document);
        }
        writer.optimize();
        writer.close();
        long endTime = System.currentTimeMillis();
        long elapsedTime = endTime - beginTime;
        System.out.println("created index in " + elapsedTime + " ms");
    } catch (CorruptIndexException e) {
        throw new RuntimeException(e);
    } catch (LockObtainFailedException e) {
        throw new RuntimeException(e);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
From source file:com.example.search.IndexFiles.java
License:Apache License
/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 *
 * NOTE: This method indexes one document per input file. This is slow. For good
 * throughput, put multiple documents into your input file(s). An example of this is
 * in the benchmark module, which can create "line doc" files, one document per line,
 * using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file The file to index, or the directory to recurse into to find files to index
 * @throws IOException
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            PageProcess pageProcessor;
            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
                pageProcessor = new PageProcess(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this exception with an "access denied" message
                // checking if the file can be read doesn't help
                return;
            }
            WebInfo webInfo;
            try {
                // make a new, empty document for each page
                while ((webInfo = pageProcessor.next()) != null) {
                    Document doc = new Document(); // process page

                    // Add the path of the file as a field named "path". Use a
                    // field that is indexed (i.e. searchable), but don't tokenize
                    // the field into separate words and don't index term frequency
                    // or positional information:
                    // Field pathField = new Field("path", file.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
                    // pathField.setIndexOptions(IndexOptions.DOCS_ONLY);
                    // doc.add(pathField);

                    Field urlField = new Field("url", webInfo.url, Field.Store.YES, Field.Index.NO);
                    doc.add(urlField);
                    Field publishidField = new Field("publishid", webInfo.publishid, Field.Store.YES, Field.Index.NO);
                    doc.add(publishidField);
                    Field subjectidField = new Field("subjectid", webInfo.subjectid, Field.Store.YES, Field.Index.NO);
                    doc.add(subjectidField);
                    Field titleField = new Field("title", webInfo.title, Field.Store.YES, Field.Index.NO);
                    doc.add(titleField);
                    doc.add(new Field("keywords", webInfo.keywords, Field.Store.YES, Field.Index.NO));
                    doc.add(new Field("description", webInfo.description, Field.Store.YES, Field.Index.NO));
                    doc.add(new Field("content", webInfo.content, Field.Store.YES, Field.Index.ANALYZED));

                    // Add the last modified date of the file as a field named "modified".
                    // Use a NumericField that is indexed (i.e. efficiently filterable with
                    // NumericRangeFilter). This indexes to milli-second resolution, which
                    // is often too fine. You could instead create a number based on
                    // year/month/day/hour/minutes/seconds, down to the resolution you require.
                    // For example the long value 2011021714 would mean
                    // February 17, 2011, 2-3 PM.
                    // NumericField modifiedField = new NumericField("modified");
                    // modifiedField.setLongValue(file.lastModified());
                    // doc.add(modifiedField);

                    // Add the contents of the file to a field named "contents". Specify a Reader,
                    // so that the text of the file is tokenized and indexed, but not stored.
                    // Note that FileReader expects the file to be in UTF-8 encoding.
                    // If that's not the case searching for special characters will fail.
                    // doc.add(new Field("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                    if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                        // New index, so we just add the document (no old document can be there):
                        // System.out.println("adding " + file);
                        writer.addDocument(doc);
                    } else {
                        // Existing index (an old copy of this document may have been indexed) so
                        // we use updateDocument instead to replace the old one matching the exact
                        // path, if present:
                        // System.out.println("updating " + file);
                        writer.updateDocument(new Term("url", webInfo.url), doc);
                    }
                } // end while
            } finally {
                System.out.println("adding " + file);
                fis.close();
            }
        }
    }
}
From source file:com.flaptor.hounder.searcher.spell.SpellChecker.java
License:Apache License
/**
 * Index a Dictionary
 * @param dict the dictionary to index
 * @throws IOException
 */
public void indexDictionary(Dictionary dict) throws IOException {
    IndexWriter.unlock(spellindex);
    IndexWriter writer = new IndexWriter(spellindex, new WhitespaceAnalyzer(),
            !IndexReader.indexExists(spellindex), IndexWriter.MaxFieldLength.UNLIMITED);
    writer.setMergeFactor(300);
    writer.setMaxBufferedDocs(150);

    for (Pair<String, Float> pair : dict) {
        String word = pair.first();
        float boost = pair.last();
        int len = word.length();
        if (len < 3) {
            continue; // too short we bail but "too long" is fine...
        }
        if (this.exist(word)) {
            // if the word already exists in the gramindex
            continue;
        }
        // ok, index the word
        Document doc = createDocument(word, boost, getMin(len), getMax(len));
        writer.addDocument(doc);
    }
    // close writer
    writer.optimize();
    writer.close();
    // close reader
    reader.close();
    reader = null;
}
From source file:com.flycode.CRIBSearch.SearchEngine.Demo.IndexFiles.java
License:Apache License
/**
 * Indexes a single document
 */
private static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    try (InputStream stream = Files.newInputStream(file)) {
        // make a new, empty document
        Document doc = new Document();

        // Add the path of the file as a field named "path". Use a
        // field that is indexed (i.e. searchable), but don't tokenize
        // the field into separate words and don't index term frequency
        // or positional information:
        Field pathField = new StringField("path", file.toString(), Field.Store.YES);
        doc.add(pathField);

        // Add the last modified date of the file as a field named "modified".
        // Use a LongPoint that is indexed (i.e. efficiently filterable with
        // PointRangeQuery). This indexes to milli-second resolution, which
        // is often too fine. You could instead create a number based on
        // year/month/day/hour/minutes/seconds, down to the resolution you require.
        // For example the long value 2011021714 would mean
        // February 17, 2011, 2-3 PM.
        doc.add(new LongPoint("modified", lastModified));

        // Add the contents of the file to a field named "contents". Specify a Reader,
        // so that the text of the file is tokenized and indexed, but not stored.
        // Note that FileReader expects the file to be in UTF-8 encoding.
        // If that's not the case searching for special characters will fail.
        doc.add(new TextField("contents",
                new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))));

        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // New index, so we just add the document (no old document can be there):
            System.out.println("adding " + file);
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been indexed) so
            // we use updateDocument instead to replace the old one matching the exact
            // path, if present:
            System.out.println("updating " + file);
            writer.updateDocument(new Term("path", file.toString()), doc);
        }
    }
}
From source file:com.FormBasedXmlQueryDemo.java
License:Apache License
private void openExampleIndex() throws IOException {
    // Create a RAM-based index from our test data file
    RAMDirectory rd = new RAMDirectory();
    IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_40, analyzer);
    IndexWriter writer = new IndexWriter(rd, iwConfig);
    InputStream dataIn = getServletContext().getResourceAsStream("/WEB-INF/data.tsv");
    BufferedReader br = new BufferedReader(new InputStreamReader(dataIn, IOUtils.CHARSET_UTF_8));
    String line = br.readLine();
    final FieldType textNoNorms = new FieldType(TextField.TYPE_STORED);
    textNoNorms.setOmitNorms(true);
    while (line != null) {
        line = line.trim();
        if (line.length() > 0) {
            // parse row and create a document
            StringTokenizer st = new StringTokenizer(line, "\t");
            Document doc = new Document();
            doc.add(new Field("location", st.nextToken(), textNoNorms));
            doc.add(new Field("salary", st.nextToken(), textNoNorms));
            doc.add(new Field("type", st.nextToken(), textNoNorms));
            doc.add(new Field("description", st.nextToken(), textNoNorms));
            writer.addDocument(doc);
        }
        line = br.readLine();
    }
    writer.close();

    // open searcher
    // this example never closes its reader!
    IndexReader reader = DirectoryReader.open(rd);
    searcher = new IndexSearcher(reader);
}