List of usage examples for org.apache.lucene.index IndexWriter addDocument
public long addDocument(Iterable<? extends IndexableField> doc) throws IOException
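A minimal, self-contained sketch of this call, assuming a Lucene 6.x-era API (where addDocument returns a sequence number); the field names and sample text are illustrative, not taken from the examples below:

import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class AddDocumentSketch {
    public static void main(String[] args) throws IOException {
        Directory dir = new RAMDirectory(); // in-memory index, for illustration only
        try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            Document doc = new Document();
            doc.add(new StringField("id", "doc-1", Store.YES));        // indexed as one exact token
            doc.add(new TextField("body", "hello lucene", Store.YES)); // analyzed full text
            long seqNo = writer.addDocument(doc); // sequence number identifying this change
        }
    }
}

The examples below come from real projects and span several Lucene versions, so the Document, Field, and IndexWriter construction idioms vary.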
From source file:com.tamingtext.frankenstein.Frankenstein.java
License:Apache License
/**
 * Index the content of Frankenstein, one paragraph per Lucene document.
 *
 * @throws IOException
 */
private void index() throws IOException {
    System.out.println("Indexing Frankenstein");
    InputStream stream = getClass().getClassLoader().getResourceAsStream("frankenstein-gutenberg.txt");
    BufferedReader reader = new BufferedReader(new InputStreamReader(stream));
    // let's index a paragraph at a time
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36));
    directory = new RAMDirectory();
    IndexWriter iw = new IndexWriter(directory, conf);
    String line;
    StringBuilder paraBuffer = new StringBuilder(2048);
    int lines = 0;
    int paragraphs = 0;
    int paragraphLines = 0;
    while ((line = reader.readLine()) != null) {
        if (line.contains("End of the Project Gutenberg")) {
            // we are in the license section at the end of the book
            break;
        }
        if (line.startsWith("#")) { // skip comments
            continue;
        }
        // if the line is blank, we have a complete paragraph, so let's index it
        if (line.matches("^\\s*$") && paraBuffer.length() > 0) {
            Document doc = new Document();
            // we can retrieve by paragraph number if we want
            String theString = paraBuffer.toString().trim();
            if (theString.length() > 0 && theString.matches("^\\s*$") == false) {
                addMetadata(doc, lines, paragraphs, paragraphLines);
                // add the main content
                doc.add(new Field("paragraph", theString, Field.Store.YES, Field.Index.ANALYZED));
                // index the document
                iw.addDocument(doc);
                paragraphs++;
            }
            // reset some of our state
            paraBuffer.setLength(0); // we are done with this paragraph
            paragraphLines = 0;
        } else {
            paraBuffer.append(line).append(' ');
        }
        lines++;
        paragraphLines++;
    }
    System.out.println("Processed " + lines + " lines. Paragraphs: " + paragraphs);
    iw.close();
}
From source file:com.tamingtext.fuzzy.OverlapMeasures.java
License:Apache License
public TopDocs cosine(String queryTerm, int n, String... terms) throws IOException, ParseException {
    Directory directory = new RAMDirectory();
    final Pattern pattern = Pattern.compile(".");
    // tokenizes each document into individual characters
    Analyzer analyzer = new Analyzer() {
        @Override
        public TokenStream tokenStream(String fieldName, Reader reader) {
            TokenStream result = null;
            try {
                result = new PatternTokenizer(reader, pattern, 0);
            } catch (IOException e) {
                // swallowed: fall through and return null
            }
            return result;
        }
    };
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_36, analyzer);
    IndexWriter writer = new IndexWriter(directory, conf);
    for (String term : terms) {
        Document doc = new Document();
        doc.add(new Field("chars", term, Field.Store.YES, Field.Index.ANALYZED));
        writer.addDocument(doc);
    }
    writer.close();
    IndexReader reader = IndexReader.open(directory);
    IndexSearcher searcher = new IndexSearcher(reader);
    TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), terms.length);
    for (int i = 0; i < topDocs.scoreDocs.length; i++) {
        System.out.println("Id: " + topDocs.scoreDocs[i].doc + " Val: "
                + searcher.doc(topDocs.scoreDocs[i].doc).get("chars"));
    }
    QueryParser qp = new QueryParser(Version.LUCENE_36, "chars", analyzer);
    Query query = qp.parse(queryTerm);
    return searcher.search(query, n);
}
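A hypothetical invocation of the method above; the query and terms are illustrative, and it assumes OverlapMeasures has a no-argument constructor:

OverlapMeasures om = new OverlapMeasures();
// index "taming", "text", and "tamingtext" character-by-character, then rank them against the query
TopDocs top = om.cosine("tamingtext", 2, "taming", "text", "tamingtext");
System.out.println("hits: " + top.totalHits);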
From source file:com.taobao.common.tedis.support.lucene.analysis.xanalyzer.TestHighLight.java
License:Open Source License
/**
 * @param args
 */
public static void main(String[] args) {
    Directory ramDir = new RAMDirectory();
    try {
        IndexWriter writer = new IndexWriter(ramDir, /* new StandardAnalyzer() */ XFactory.getWriterAnalyzer());
        Document doc = new Document();
        Field fd = new Field(FIELD_NAME, CONTENT, Field.Store.YES, Field.Index.TOKENIZED,
                Field.TermVector.WITH_POSITIONS_OFFSETS);
        doc.add(fd);
        writer.addDocument(doc);
        writer.optimize();
        writer.close();

        IndexReader reader = IndexReader.open(ramDir);
        String queryString = QUERY;
        QueryParser parser = new QueryParser(FIELD_NAME, /* new StandardAnalyzer() */ XFactory.getWriterAnalyzer());
        Query query = parser.parse(queryString);
        System.out.println(query);
        Searcher searcher = new IndexSearcher(ramDir);
        query = query.rewrite(reader);
        System.out.println(query);
        System.out.println("Searching for: " + query.toString(FIELD_NAME));
        Hits hits = searcher.search(query);

        BoldFormatter formatter = new BoldFormatter();
        Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
        highlighter.setTextFragmenter(new SimpleFragmenter(50));
        for (int i = 0; i < hits.length(); i++) {
            String text = hits.doc(i).get(FIELD_NAME);
            int maxNumFragmentsRequired = 5;
            String fragmentSeparator = "...";
            TermPositionVector tpv = (TermPositionVector) reader.getTermFreqVector(hits.id(i), FIELD_NAME);
            TokenStream tokenStream = TokenSources.getTokenStream(tpv);
            /*
             * TokenStream tokenStream2 = (new StandardAnalyzer()) // XFactory.getWriterAnalyzer()
             *         .tokenStream(FIELD_NAME, new StringReader(text));
             * do {
             *     Token t = tokenStream2.next();
             *     if (t == null) break;
             *     System.out.println("\t" + t.startOffset() + "," + t.endOffset() + "\t" + t.termText());
             * } while (true);
             */
            String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
                    fragmentSeparator);
            System.out.println("\n" + result);
        }
        reader.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:com.tekstosense.stemmer.index.Indexer.java
License:Open Source License
/**
 * Adds a single document with a title and an ISBN to the index.
 *
 * @param w     the index writer
 * @param title the title
 * @param isbn  the isbn
 * @throws IOException Signals that an I/O exception has occurred.
 */
private static void addDoc(IndexWriter w, String title, String isbn) throws IOException {
    Document doc = new Document();
    doc.add(new TextField("title", title, Store.YES));   // analyzed: searchable by individual words
    doc.add(new StringField("isbn", isbn, Store.YES));   // not analyzed: indexed as a single exact token
    w.addDocument(doc);
}
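A hypothetical caller for the helper above, assuming a Lucene 5.x+ setup; the index path, titles, and ISBNs are made up:

try (Directory dir = FSDirectory.open(Paths.get("/tmp/books-index"));
     IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
    addDoc(w, "Lucene in Action", "978-1933988177");
    addDoc(w, "Taming Text", "978-1933988382");
}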
From source file:com.test.LuceneDemo.java
License:Apache License
@Test
public void test() throws IOException, org.apache.lucene.queryparser.classic.ParseException {
    Analyzer analyzer = new StandardAnalyzer();

    // Store the index in memory:
    Directory directory = new RAMDirectory();
    // To store an index on disk, use this instead:
    // Directory directory = FSDirectory.open(Paths.get("/tmp/testindex"));
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    IndexWriter iwriter = new IndexWriter(directory, config);
    Document doc = new Document();
    String text = "This is the text to be indexed.";
    doc.add(new Field("fieldname", text, TextField.TYPE_STORED));
    iwriter.addDocument(doc);
    iwriter.close();

    // Now search the index:
    DirectoryReader ireader = DirectoryReader.open(directory);
    IndexSearcher isearcher = new IndexSearcher(ireader);
    // Parse a simple query that searches for "text":
    QueryParser parser = new QueryParser("fieldname", analyzer);
    Query query = parser.parse("indexed");
    ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs;
    assertEquals(1, hits.length);
    // Iterate through the results:
    for (int i = 0; i < hits.length; i++) {
        Document hitDoc = isearcher.doc(hits[i].doc);
        assertEquals("This is the text to be indexed.", hitDoc.get("fieldname"));
    }
    ireader.close();
    directory.close();
}
From source file:com.tistory.devyongsik.demo.IndexFiles.java
License:Apache License
/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 *
 * NOTE: This method indexes one document per input file. This is slow. For good
 * throughput, put multiple documents into your input file(s). An example of this is
 * in the benchmark module, which can create "line doc" files, one document per line,
 * using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file   The file to index, or the directory to recurse into to find files to index
 * @throws IOException
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this exception with an "access denied" message
                // checking if the file can be read doesn't help
                return;
            }
            try {
                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path". Use a
                // field that is indexed (i.e. searchable), but don't tokenize
                // the field into separate words and don't index term frequency
                // or positional information:
                Field pathField = new Field("path", file.getPath(), Field.Store.YES,
                        Field.Index.NOT_ANALYZED_NO_NORMS);
                pathField.setOmitTermFreqAndPositions(true);
                doc.add(pathField);

                // Add the last modified date of the file as a field named "modified".
                // Use a NumericField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter). This indexes to milli-second resolution, which
                // is often too fine. You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down to the resolution you require.
                // For example the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.
                NumericField modifiedField = new NumericField("modified");
                modifiedField.setLongValue(file.lastModified());
                doc.add(modifiedField);

                // Add the contents of the file to a field named "contents". Specify a Reader,
                // so that the text of the file is tokenized and indexed, but not stored.
                // Note that FileReader expects the file to be in UTF-8 encoding.
                // If that's not the case searching for special characters will fail.
                doc.add(new Field("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been indexed) so
                    // we use updateDocument instead to replace the old one matching the exact
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }
            } finally {
                fis.close();
            }
        }
    }
}
From source file:com.vmware.dcp.services.common.LuceneBlobIndexService.java
License:Open Source License
protected void handlePost(Operation post) {
    if (post.isRemote()) {
        post.fail(new IllegalStateException("Remote requests not allowed"));
        return;
    }
    Map<String, String> params = UriUtils.parseUriQueryParams(post.getUri());
    String key = params.get(URI_PARAM_NAME_KEY);
    if (key == null) {
        post.fail(new IllegalArgumentException("key query parameter is required"));
        return;
    }
    String updateTimeParam = params.get(URI_PARAM_NAME_UPDATE_TIME);
    if (updateTimeParam == null) {
        post.fail(new IllegalArgumentException("update time query parameter is required"));
        return;
    }
    long updateTime = Long.parseLong(updateTimeParam);
    IndexWriter wr = this.writer;
    if (wr == null) {
        post.fail(new CancellationException());
        return;
    }
    try {
        Object content = post.getBodyRaw();
        if (content == null) {
            post.fail(new IllegalArgumentException("service instance is required"));
            return;
        }
        byte[] binaryContent = new byte[this.maxBinaryContextSizeBytes];
        int count = Utils.toBytes(content, binaryContent, 0);
        Document doc = new Document();
        Field binaryContentField = new StoredField(LUCENE_FIELD_NAME_BINARY_CONTENT, binaryContent, 0, count);
        doc.add(binaryContentField);
        Field keyField = new StringField(URI_PARAM_NAME_KEY, key, Field.Store.NO);
        doc.add(keyField);
        Field updateTimeField = new LongField(URI_PARAM_NAME_UPDATE_TIME, updateTime, this.longStoredField);
        doc.add(updateTimeField);
        wr.addDocument(doc);
        this.indexUpdateTimeMicros = Utils.getNowMicrosUtc();
        post.setBody(null).complete();
    } catch (Throwable e) {
        logSevere(e);
        post.fail(e);
    }
}
From source file:com.vmware.dcp.services.common.LuceneDocumentIndexService.java
License:Open Source License
private void addDocumentToIndex(Operation op, Document doc, ServiceDocument sd, ServiceDocumentDescription desc)
        throws IOException {
    IndexWriter wr = this.writer;
    if (wr == null) {
        op.fail(new CancellationException());
        return;
    }
    long start = Utils.getNowMicrosUtc();
    wr.addDocument(doc);
    updateSelfLinkInfo(sd);
    long end = Utils.getNowMicrosUtc();
    if (hasOption(ServiceOption.INSTRUMENTATION)) {
        ServiceStat s = getHistogramStat(STAT_NAME_INDEXING_DURATION_MICROS);
        setStat(s, end - start);
    }
    op.setBody(null).complete();
    checkDocumentRetentionLimit(sd, desc);
    applyActiveQueries(sd, desc);
}
From source file:com.vmware.xenon.services.common.LuceneBlobIndexService.java
License:Open Source License
public void handlePost(Operation post) {
    if (post.isRemote()) {
        post.fail(new IllegalStateException("Remote requests not allowed"));
        return;
    }
    Map<String, String> params = UriUtils.parseUriQueryParams(post.getUri());
    String key = params.get(URI_PARAM_NAME_KEY);
    if (key == null) {
        post.fail(new IllegalArgumentException("key query parameter is required"));
        return;
    }
    String updateTimeParam = params.get(URI_PARAM_NAME_UPDATE_TIME);
    if (updateTimeParam == null) {
        post.fail(new IllegalArgumentException("update time query parameter is required"));
        return;
    }
    long updateTime = Long.parseLong(updateTimeParam);
    IndexWriter wr = this.writer;
    if (wr == null) {
        post.fail(new CancellationException());
        return;
    }
    try {
        Object content = post.getBodyRaw();
        if (content == null) {
            post.fail(new IllegalArgumentException("service instance is required"));
            return;
        }
        byte[] binaryContent = getBuffer();
        int count = Utils.toBytes(content, binaryContent, 0);
        Document doc = new Document();
        Field binaryContentField = new StoredField(LUCENE_FIELD_NAME_BINARY_CONTENT, binaryContent, 0, count);
        doc.add(binaryContentField);
        Field keyField = new StringField(URI_PARAM_NAME_KEY, key, Field.Store.NO);
        doc.add(keyField);
        LuceneDocumentIndexService.addNumericField(doc, URI_PARAM_NAME_UPDATE_TIME, updateTime, true);
        wr.addDocument(doc);
        this.indexUpdateTimeMicros = Utils.getNowMicrosUtc();
        post.setBody(null).complete();
    } catch (Throwable e) {
        logSevere(e);
        post.fail(e);
    }
}
From source file:com.vmware.xenon.services.common.LuceneDocumentIndexService.java
License:Open Source License
private void addDocumentToIndex(Operation op, Document doc, ServiceDocument sd, ServiceDocumentDescription desc)
        throws IOException {
    IndexWriter wr = this.writer;
    if (wr == null) {
        op.fail(new CancellationException());
        return;
    }
    long start = Utils.getNowMicrosUtc();
    wr.addDocument(doc);
    long end = Utils.getNowMicrosUtc();
    // Use the time AFTER the index was updated to be sure that it can be compared
    // against the time the searcher was updated and have this change
    // be reflected in the new searcher. If the start time were used,
    // it would be possible to race with updating the searcher and NOT have this
    // change be reflected in the searcher.
    updateLinkAccessTime(end, sd.documentSelfLink);
    if (hasOption(ServiceOption.INSTRUMENTATION)) {
        ServiceStat s = getHistogramStat(STAT_NAME_INDEXING_DURATION_MICROS);
        setStat(s, end - start);
    }
    op.setBody(null).complete();
    checkDocumentRetentionLimit(sd, desc);
    applyActiveQueries(sd, desc);
}