Example usage for org.apache.lucene.index IndexWriter addDocument

List of usage examples for org.apache.lucene.index IndexWriter addDocument

Introduction

On this page you can find example usage for org.apache.lucene.index IndexWriter addDocument.

Prototype

public long addDocument(Iterable<? extends IndexableField> doc) throws IOException 

Source Link

Document

Adds a document to this index.

Usage

From source file:com.edu.lucene.IndexFiles.java

License:Apache License

static void indexDocs(IndexWriter writer, File file) throws IOException {
    // Skip anything we are not permitted to read.
    if (!file.canRead()) {
        return;
    }
    if (file.isDirectory()) {
        String[] children = file.list();
        // list() returns null on an IO error; nothing to recurse into then.
        if (children != null) {
            for (String child : children) {
                indexDocs(writer, new File(file, child));
            }
        }
    } else {
        System.out.println("adding " + file);
        try {
            writer.addDocument(FileDocument.Document(file));
        } catch (FileNotFoundException fnfe) {
            // Deliberately ignored: at least on Windows, some temporary files
            // raise this exception with an "access denied" message even though
            // canRead() returned true, so checking readability doesn't help.
        }
    }
}

From source file:com.ekinoks.lucene.introduction.demos.IndexFiles.java

License:Apache License

/**
 * Indexes the given file using the given writer, or if a directory is
 * given, recurses over files and directories found under the given
 * directory.
 * 
 * NOTE: This method indexes one document per input file. This is slow. For
 * good throughput, put multiple documents into your input file(s). An
 * example of this is in the benchmark module, which can create "line doc"
 * files, one document per line, using the <a href=
 * "../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 * 
 * @param writer
 *            Writer to the index where the given file/dir info will be
 *            stored
 * @param file
 *            The file to index, or the directory to recurse into to find
 *            files to index
 * @throws IOException
 *             if indexing fails or the input stream cannot be closed
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // list() returns null if an IO error occurred
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {

            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this
                // exception with an "access denied" message
                // checking if the file can be read doesn't help
                return;
            }

            try {

                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path". Use a
                // field that is indexed (i.e. searchable), but don't tokenize
                // the field into separate words and don't index term frequency
                // or positional information:
                Field pathField = new Field("path", file.getPath(), Field.Store.YES,
                        Field.Index.NOT_ANALYZED_NO_NORMS);
                pathField.setOmitTermFreqAndPositions(true);
                doc.add(pathField);

                // Add the last modified date of the file as a field named
                // "modified". Use a NumericField that is indexed (i.e.
                // efficiently filterable with NumericRangeFilter). This
                // indexes to milli-second resolution, which is often too
                // fine. You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down the resolution
                // you require. For example the long value 2011021714 would
                // mean February 17, 2011, 2-3 PM.
                NumericField modifiedField = new NumericField("modified");
                modifiedField.setLongValue(file.lastModified());
                doc.add(modifiedField);

                // Add the contents of the file to a field named "contents".
                // Specify a Reader, so that the text of the file is tokenized
                // and indexed, but not stored. The reader decodes the file as
                // UTF-8; if that's not the actual encoding, searching for
                // special characters will fail.
                doc.add(new Field("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old
                    // document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have
                    // been indexed) so we use updateDocument instead to
                    // replace the old one matching the exact path, if
                    // present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }

            } finally {
                // The stream is consumed lazily by the "contents" field, so it
                // is closed only after the writer has added/updated the doc.
                fis.close();
            }
        }
    }
}

From source file:com.emental.mindraider.core.search.SearchCommander.java

License:Apache License

/**
 * Recursively indexes concept files found under the given file/directory
 * into the full-text-search index.
 * 
 * @param writer
 *            the index writer
 * @param file
 *            the file to index, or the directory to recurse into
 * @throws IOException
 *             the I/O exception
 */
public static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // list() returns null if an IO error occurred
            if (files != null) {
                for (String filename : files) {
                    indexDocs(writer, new File(file, filename));
                }
            }
        } else {
            StatusBar.setText(action, file.getAbsolutePath(), 70);

            try {
                // only concept files (those under the concepts directory) are indexed
                if (file.getAbsolutePath()
                        .indexOf(File.separator + CONCEPTS_DIRECTORY_NAME + File.separator) >= 0) {
                    ConceptResource conceptResource = new ConceptResource(new Resource(file.getAbsolutePath()));

                    // FTS index
                    // TODO parse notebook label from the path for now
                    String notebookUri = conceptResource.getNotebookUri();
                    String notebookLabel;
                    if (notebookUri != null && (notebookUri.indexOf("#") >= 0)) {
                        // label = URI fragment (text after '#')
                        // TODO from time to time the last letter is killed
                        notebookLabel = notebookUri.substring(notebookUri.indexOf("#") + 1,
                                notebookUri.length());
                        // TODO ugly hack - label must be loaded from the model (slow)
                        notebookLabel = notebookLabel.replaceAll("_", " ");
                    } else {
                        // fallback label when the URI carries no fragment
                        notebookLabel = "Notebook";
                    }

                    // side effect: register the concept's tags with the tag custodian
                    CategoryProperty[] tagsAndFlag = conceptResource.getCategories();
                    if (tagsAndFlag != null && tagsAndFlag.length > 0) {
                        for (CategoryProperty tagOrFlag : tagsAndFlag) {
                            // only tags (not the flag!) are indexed
                            if (tagOrFlag.getCategoryValue() != null
                                    && tagOrFlag.getCategoryValue().length() > 0) {
                                if (!tagOrFlag.getCategoryValue()
                                        .startsWith(MindRaiderConstants.MR_OWL_FLAG_NS)) {
                                    MindRaider.tagCustodian.addOrInc(
                                            new TagEntryImpl(tagOrFlag.getCategoryValue(),
                                                    tagOrFlag.getCategoryCaption(), 1),
                                            new TaggedResourceEntry(notebookUri, notebookLabel,
                                                    conceptResource.getUri(), conceptResource.getLabel(),
                                                    conceptResource.resource.getMetadata().getTimestamp(),
                                                    file.getAbsoluteFile().getAbsolutePath()));
                                }
                            }
                        }
                    }

                    // write it to index
                    writer.addDocument(FileDocument.Document(file, notebookLabel, conceptResource.getLabel(),
                            conceptResource.getUri()));
                }
            } catch (EOFException e) {
                // truncated concept file - log and skip it
                logger.debug("Unable to read file " + file.getAbsolutePath(), e);
            }
            // at least on windows, some temporary files raise this
            // exception with an "access denied" message
            // checking if the file can be read doesn't help
            catch (Exception e) {
                logger.debug("File not found!", e);
            }
        }
    }
}

From source file:com.emental.mindraider.core.search.SearchCommander.java

License:Apache License

/**
 * Replaces the FTS index entry for the given concept (keyed by its URI)
 * with a freshly built document.
 *
 * @param conceptFile   concept file backing the document
 * @param notebookLabel label of the owning notebook
 * @param conceptLabel  label of the concept
 * @param conceptUri    URI identifying the concept (delete/re-add key)
 */
public static void updateIndex(File conceptFile, String notebookLabel, String conceptLabel, String conceptUri) {
    // TODO the body of this method should run in an asynchronous thread
    IndexWriter writer = null;
    try {
        // false => append to the existing index rather than recreating it
        writer = new IndexWriter(getSearchIndexPath(), new StandardAnalyzer(), false);
        // update document via concept URI
        logger.debug("UPDATing FTS index for concept: " + conceptFile + " # " + notebookLabel + " # "
                + conceptLabel + " # " + conceptUri); // {{debug}}
        Document document = FileDocument.Document(conceptFile, notebookLabel, conceptLabel, conceptUri);
        writer.deleteDocuments(new Term("uri", conceptUri));
        writer.addDocument(document);
        // TODO removed just for now (before it will be done in async)
        //writer.optimize();
    } catch (Exception e) {
        logger.debug("Unable to update FTS index", e); // {{debug}}
    } finally {
        // BUG FIX: the original only closed the writer on the happy path,
        // leaking it (and its index lock) when any step above threw.
        if (writer != null) {
            try {
                writer.close();
            } catch (Exception e) {
                logger.debug("Unable to close FTS index writer", e); // {{debug}}
            }
        }
    }
}

From source file:com.epimorphics.server.indexers.LuceneIndex.java

License:Apache License

/**
 * Builds a Lucene document for a named RDF entity and writes it to the
 * index, either as a fresh add or as an update keyed on the entity URI.
 */
private void indexEntity(IndexWriter iwriter, boolean update, String graphname, Resource entity)
        throws IOException {
    // Blank nodes have no URI to key the document on, so they are skipped.
    if (entity.isAnon())
        return;
    Document doc = new Document();
    doc.add(new StringField(FIELD_URI, entity.getURI(), Field.Store.YES));
    doc.add(new StringField(FIELD_GRAPH, graphname, Field.Store.YES));
    // One indexed field per statement, classified by its predicate.
    StmtIterator statements = entity.listProperties();
    while (statements.hasNext()) {
        Statement stmt = statements.next();
        Property pred = stmt.getPredicate();
        RDFNode obj = stmt.getObject();
        String text = asString(obj);
        if (labelProps.contains(pred)) {
            // Label property: stored under the predicate URI and also made
            // searchable through the shared label field.
            doc.add(new TextField(pred.getURI(), text, Field.Store.YES));
            doc.add(new TextField(FIELD_LABEL, text, Field.Store.NO));
        } else if (labelOnlyProps.contains(pred)) {
            // Label-only property: searchable but never stored.
            doc.add(new TextField(pred.getURI(), text, Field.Store.NO));
            doc.add(new TextField(FIELD_LABEL, text, Field.Store.NO));
        } else if (valueProps.contains(pred) || (indexAll && !ignoreProps.contains(pred))) {
            if (obj.isURIResource()) {
                doc.add(new StringField(pred.getURI(), obj.asResource().getURI(), Field.Store.YES));
                // Alternative below would share storage of URIs but only allows per document field
                //                    doc.add( new DerefBytesDocValuesField(pred.getURI(), new BytesRef(obj.asResource().getURI())) );
            } else if (obj.isLiteral()) {
                // Integral literals get a numeric field so they can be
                // range-queried; everything else is indexed as text.
                Object javaValue = obj.asLiteral().getValue();
                if (javaValue instanceof Long || javaValue instanceof Integer) {
                    doc.add(new LongField(pred.getURI(), ((Number) javaValue).longValue(), Field.Store.YES));
                } else {
                    doc.add(new TextField(pred.getURI(), text, Field.Store.YES));
                }
            }
        }
    }
    // Update replaces any prior document with the same URI; add appends new.
    if (update) {
        iwriter.updateDocument(new Term(FIELD_URI, entity.getURI()), doc);
    } else {
        iwriter.addDocument(doc);
    }
}

From source file:com.example.analyzer.server.database.DbFullTextIndex.java

License:Open Source License

/**
 * Builds an in-memory full-text index over one column of the given table:
 * one Lucene document per row, holding the row id and the analyzed value.
 *
 * @param dbTable      source table
 * @param columnOffset offset of the column to index
 * @throws RuntimeException wrapping any index I/O failure
 */
public DbFullTextIndex(DbTable dbTable, int columnOffset) {
    try {
        long beginTime = System.currentTimeMillis();
        ramDirectory = new RAMDirectory();
        IndexWriter writer = new IndexWriter(ramDirectory, new StandardAnalyzer(Version.LUCENE_30),
                new MaxFieldLength(50));
        try {
            int rowCount = dbTable.getRowCount();
            for (int rowOffset = 0; rowOffset < rowCount; rowOffset++) {
                // null column values are coalesced to the empty string
                String value = dbTable.coalesce(rowOffset, columnOffset, "").toString();
                byte[] idArray = getBytes(rowOffset);
                Document document = new Document();
                document.add(new Field(ID, idArray, Field.Store.YES));
                document.add(new Field(VALUE, value, Store.YES, Index.ANALYZED)); // TODO: Determine whether we need to store value
                writer.addDocument(document);
            }
            writer.optimize();
        } finally {
            // BUG FIX: the original leaked the writer when addDocument() or
            // optimize() threw mid-loop; always close it.
            writer.close();
        }
        long endTime = System.currentTimeMillis();
        long elapsedTime = endTime - beginTime;
        System.out.println("created index in " + elapsedTime + " ms");
    } catch (IOException e) {
        // CorruptIndexException and LockObtainFailedException are IOException
        // subclasses; one catch replaces the three identical original handlers.
        throw new RuntimeException(e);
    }
}

From source file:com.example.search.IndexFiles.java

License:Apache License

/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 * /*from  w  ww.j  a  v a  2s. co m*/
 * NOTE: This method indexes one document per input file.  This is slow.  For good
 * throughput, put multiple documents into your input file(s).  An example of this is
 * in the benchmark module, which can create "line doc" files, one document per line,
 * using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *  
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file The file to index, or the directory to recurse into to find files to index
 * @throws IOException
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            PageProcess pageProcessor;
            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
                pageProcessor = new PageProcess(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this exception with an "access denied" message
                // checking if the file can be read doesn't help
                return;
            }
            WebInfo webInfo;
            try {
                //
                // make a new, empty document
                while ((webInfo = pageProcessor.next()) != null) {
                    Document doc = new Document();
                    //process page
                    // Add the path of the file as a field named "path".  Use a
                    // field that is indexed (i.e. searchable), but don't tokenize 
                    // the field into separate words and don't index term frequency
                    // or positional information:
                    //          Field pathField = new Field("path", file.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
                    //          pathField.setIndexOptions(IndexOptions.DOCS_ONLY);
                    //          doc.add(pathField);
                    Field urlField = new Field("url", webInfo.url, Field.Store.YES, Field.Index.NO);
                    doc.add(urlField);
                    Field publishidField = new Field("publishid", webInfo.publishid, Field.Store.YES,
                            Field.Index.NO);
                    doc.add(publishidField);
                    Field subjectidField = new Field("subjectid", webInfo.subjectid, Field.Store.YES,
                            Field.Index.NO);
                    doc.add(subjectidField);
                    Field titleField = new Field("title", webInfo.title, Field.Store.YES, Field.Index.NO);
                    doc.add(titleField);
                    doc.add(new Field("keywords", webInfo.keywords, Field.Store.YES, Field.Index.NO));
                    doc.add(new Field("description", webInfo.description, Field.Store.YES, Field.Index.NO));
                    doc.add(new Field("content", webInfo.content, Field.Store.YES, Field.Index.ANALYZED));
                    // Add the last modified date of the file a field named "modified".
                    // Use a NumericField that is indexed (i.e. efficiently filterable with
                    // NumericRangeFilter).  This indexes to milli-second resolution, which
                    // is often too fine.  You could instead create a number based on
                    // year/month/day/hour/minutes/seconds, down the resolution you require.
                    // For example the long value 2011021714 would mean
                    // February 17, 2011, 2-3 PM.
                    //          NumericField modifiedField = new NumericField("modified");
                    //          modifiedField.setLongValue(file.lastModified());
                    //          doc.add(modifiedField);

                    // Add the contents of the file to a field named "contents".  Specify a Reader,
                    // so that the text of the file is tokenized and indexed, but not stored.
                    // Note that FileReader expects the file to be in UTF-8 encoding.
                    // If that's not the case searching for special characters will fail.
                    //          doc.add(new Field("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                    if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {//
                        // New index, so we just add the document (no old document can be there):
                        //            System.out.println("adding " + file);
                        writer.addDocument(doc);
                    } else {
                        // Existing index (an old copy of this document may have been indexed) so 
                        // we use updateDocument instead to replace the old one matching the exact 
                        // path, if present:
                        //            System.out.println("updating " + file);
                        writer.updateDocument(new Term("url", webInfo.url), doc);
                    }
                } //end while
            } finally {
                System.out.println("adding " + file);
                fis.close();
            }
        }
    }
}

From source file:com.flaptor.hounder.searcher.spell.SpellChecker.java

License:Apache License

/**
 * Index a Dictionary/*from w  w w .  ja  va  2  s  . c  o  m*/
 * @param dict the dictionary to index
 * @throws IOException
 */
public void indexDictionary(Dictionary dict) throws IOException {
    IndexWriter.unlock(spellindex);
    IndexWriter writer = new IndexWriter(spellindex, new WhitespaceAnalyzer(),
            !IndexReader.indexExists(spellindex), IndexWriter.MaxFieldLength.UNLIMITED);
    writer.setMergeFactor(300);
    writer.setMaxBufferedDocs(150);

    for (Pair<String, Float> pair : dict) {
        String word = pair.first();
        float boost = pair.last();

        int len = word.length();
        if (len < 3) {
            continue; // too short we bail but "too long" is fine...
        }

        if (this.exist(word)) { // if the word already exist in the gramindex
            continue;
        }

        // ok index the word
        Document doc = createDocument(word, boost, getMin(len), getMax(len));
        writer.addDocument(doc);
    }
    // close writer
    writer.optimize();
    writer.close();

    // close reader
    reader.close();
    reader = null;
}

From source file:com.flycode.CRIBSearch.SearchEngine.Demo.IndexFiles.java

License:Apache License

/**
 * Indexes a single document/*from w w w.  j  a  v  a2s .com*/
 */
private static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    try (InputStream stream = Files.newInputStream(file)) {
        // make a new, empty document
        Document doc = new Document();

        // Add the path of the file as a field named "path".  Use a
        // field that is indexed (i.e. searchable), but don't tokenize
        // the field into separate words and don't index term frequency
        // or positional information:
        Field pathField = new StringField("path", file.toString(), Field.Store.YES);
        doc.add(pathField);

        // Add the last modified date of the file a field named "modified".
        // Use a LongPoint that is indexed (i.e. efficiently filterable with
        // PointRangeQuery).  This indexes to milli-second resolution, which
        // is often too fine.  You could instead create a number based on
        // year/month/day/hour/minutes/seconds, down the resolution you require.
        // For example the long value 2011021714 would mean
        // February 17, 2011, 2-3 PM.
        doc.add(new LongPoint("modified", lastModified));

        // Add the contents of the file to a field named "contents".  Specify a Reader,
        // so that the text of the file is tokenized and indexed, but not stored.
        // Note that FileReader expects the file to be in UTF-8 encoding.
        // If that's not the case searching for special characters will fail.
        doc.add(new TextField("contents",
                new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))));

        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // New index, so we just add the document (no old document can be there):
            System.out.println("adding " + file);
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been indexed) so
            // we use updateDocument instead to replace the old one matching the exact
            // path, if present:
            System.out.println("updating " + file);
            writer.updateDocument(new Term("path", file.toString()), doc);
        }
    }
}

From source file:com.FormBasedXmlQueryDemo.java

License:Apache License

/**
 * Creates a RAM-based example index from the bundled tab-separated test data
 * file and opens a searcher over it. Each non-empty line yields one document
 * with location/salary/type/description fields.
 *
 * @throws IOException if the data file cannot be read or the index written
 */
private void openExampleIndex() throws IOException {
    //Create a RAM-based index from our test data file
    RAMDirectory rd = new RAMDirectory();
    IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_40, analyzer);
    IndexWriter writer = new IndexWriter(rd, iwConfig);
    InputStream dataIn = getServletContext().getResourceAsStream("/WEB-INF/data.tsv");
    // BUG FIX: the original never closed the reader or the underlying stream;
    // try-with-resources closes br, which closes dataIn. The writer is closed
    // in a finally so it is released even if reading throws.
    try (BufferedReader br = new BufferedReader(new InputStreamReader(dataIn, IOUtils.CHARSET_UTF_8))) {
        String line = br.readLine();
        final FieldType textNoNorms = new FieldType(TextField.TYPE_STORED);
        textNoNorms.setOmitNorms(true);
        while (line != null) {
            line = line.trim();
            if (line.length() > 0) {
                //parse a tab-separated row and create a document
                StringTokenizer st = new StringTokenizer(line, "\t");
                Document doc = new Document();
                doc.add(new Field("location", st.nextToken(), textNoNorms));
                doc.add(new Field("salary", st.nextToken(), textNoNorms));
                doc.add(new Field("type", st.nextToken(), textNoNorms));
                doc.add(new Field("description", st.nextToken(), textNoNorms));
                writer.addDocument(doc);
            }
            line = br.readLine();
        }
    } finally {
        writer.close();
    }

    //open searcher
    // this example never closes it reader!
    IndexReader reader = DirectoryReader.open(rd);
    searcher = new IndexSearcher(reader);
}