List of usage examples for org.apache.lucene.index IndexWriter addDocument
public long addDocument(Iterable<? extends IndexableField> doc) throws IOException
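Before the project-specific examples below, a minimal sketch of the call sequence. All names and paths here are placeholders rather than code from any of the listed projects, and the IndexWriterConfig constructor without a Version argument assumes Lucene 5.x or newer:

    import java.nio.file.Paths;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.StringField;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;

    // Open a writer, add one document, commit; try-with-resources closes the writer.
    Directory dir = FSDirectory.open(Paths.get("/tmp/example-index")); // placeholder path
    IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
    try (IndexWriter writer = new IndexWriter(dir, iwc)) {
        Document doc = new Document();
        doc.add(new StringField("id", "doc-1", Field.Store.YES));             // exact-match key, not tokenized
        doc.add(new TextField("body", "some analyzed text", Field.Store.NO)); // tokenized full text
        writer.addDocument(doc); // returns a sequence number in recent Lucene versions
        writer.commit();
    }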
From source file:de.tudarmstadt.lt.lm.app.GenerateNgramIndex.java
License:Apache License
public void create_ngram_index(File ngram_joined_counts_file) throws IOException {
    File index_dir = new File(_index_dir, "ngram");
    if (index_dir.exists()) {
        LOG.info("Ngram index already exists in directory '{}'.", index_dir.getAbsolutePath());
        if (_overwrite) {
            LOG.info("Overwriting index '{}',", index_dir);
            index_dir.delete();
        } else
            return;
    }
    index_dir.mkdirs();
    Analyzer analyzer = new KeywordAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
    iwc.setOpenMode(OpenMode.CREATE);
    // use a configured percentage of the available total memory as ram buffer
    double total_mem_mb = (double) Runtime.getRuntime().maxMemory() / 1e6;
    double percentage_ram_buffer = Properties.ramBufferPercentage();
    if (percentage_ram_buffer > 0) {
        double percentage_ram_buffer_mb = total_mem_mb * percentage_ram_buffer;
        LOG.info(String.format("Setting ram buffer size to %.2f MB (%.2f%% from %.2f MB)",
                percentage_ram_buffer_mb, percentage_ram_buffer * 100, total_mem_mb));
        iwc.setRAMBufferSizeMB(percentage_ram_buffer_mb);
    }
    Directory directory = new MMapDirectory(index_dir);
    IndexWriter writer_ngram = new IndexWriter(directory, iwc);

    InputStream in = new FileInputStream(ngram_joined_counts_file);
    if (ngram_joined_counts_file.getName().endsWith(".gz"))
        in = new GZIPInputStream(in);
    LineIterator iter = new LineIterator(new BufferedReader(new InputStreamReader(in, "UTF-8")));

    Document doc = new Document();
    Field f_ngram = new StringField("ngram", "", Store.YES);    doc.add(f_ngram);
    Field f_n = new IntField("cardinality", 0, Store.YES);      doc.add(f_n);
    Field f_word = new StringField("word", "", Store.YES);      doc.add(f_word);
    Field f_hist = new StringField("history", "", Store.YES);   doc.add(f_hist);
    Field f_lower = new StringField("lower", "", Store.YES);    doc.add(f_lower);
    Field f_count = new StoredField("num", 0L);                 doc.add(f_count);
    Field[] f_follow = new Field[4];
    f_follow[0] = new StoredField("nf_s", 0L);   doc.add(f_follow[0]);
    f_follow[1] = new StoredField("nf_N1", 0L);  doc.add(f_follow[1]);
    f_follow[2] = new StoredField("nf_N2", 0L);  doc.add(f_follow[2]);
    f_follow[3] = new StoredField("nf_N3", 0L);  doc.add(f_follow[3]);
    Field[] f_precede = new Field[4];
    f_precede[0] = new StoredField("np_s", 0L);  doc.add(f_precede[0]);
    f_precede[1] = new StoredField("np_N1", 0L); doc.add(f_precede[1]);
    f_precede[2] = new StoredField("np_N2", 0L); doc.add(f_precede[2]);
    f_precede[3] = new StoredField("np_N3", 0L); doc.add(f_precede[3]);
    Field[] f_followerprecede = new Field[4];
    f_followerprecede[0] = new StoredField("nfp_s", 0L);  doc.add(f_followerprecede[0]);
    f_followerprecede[1] = new StoredField("nfp_N1", 0L); doc.add(f_followerprecede[1]);
    f_followerprecede[2] = new StoredField("nfp_N2", 0L); doc.add(f_followerprecede[2]);
    f_followerprecede[3] = new StoredField("nfp_N3", 0L); doc.add(f_followerprecede[3]);

    Long[][] N = new Long[][] { { 0L, 0L, 0L, 0L, 0L, 0L } };
    Long[] S = new Long[] { 0L };
    long c = 0;
    while (iter.hasNext()) {
        if (++c % 100000 == 0)
            LOG.info("Adding {}'th ngram.", c);
        String line = iter.next();
        try {
            String[] splits = de.tudarmstadt.lt.utilities.StringUtils.rtrim(line).split("\t");
            String ngram_str = splits[0];
            if (de.tudarmstadt.lt.utilities.StringUtils.trim(ngram_str).isEmpty()) {
                LOG.warn("Ngram is empty, skipping line {}: '{}' (file '{}').", c, line, ngram_joined_counts_file);
                continue;
            }
            List<String> ngram = Arrays.asList(ngram_str.split(" "));
            long num = Long.parseLong(splits[1]);
            int n = ngram.size();
            f_ngram.setStringValue(ngram_str);
            f_n.setIntValue(n);
            f_word.setStringValue(ngram.get(ngram.size() - 1));
            f_hist.setStringValue(StringUtils.join(ngram.subList(0, ngram.size() - 1), " "));
            f_lower.setStringValue(StringUtils.join(ngram.subList(1, ngram.size()), " "));
            f_count.setLongValue(num);
            for (int j = 0; j < f_follow.length; j++) {
                f_follow[j].setLongValue(0L);
                f_precede[j].setLongValue(0L);
                f_followerprecede[j].setLongValue(0L);
            }
            if (splits.length > 2 && !splits[2].isEmpty()) { // precede or follow or followerprecede
                String[] splits_ = splits[2].split(":");
                String type = splits_[0];
                String[] count_values = splits_[1].split(",");
                if (count_values.length > 0) {
                    if ("n_f".equals(type)) f_follow[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_p".equals(type)) f_precede[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_fp".equals(type)) f_followerprecede[0].setLongValue(Long.parseLong(count_values[0]));
                }
                for (int i = 1; i < count_values.length; i++) {
                    if ("n_f".equals(type)) f_follow[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_p".equals(type)) f_precede[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_fp".equals(type)) f_followerprecede[i].setLongValue(Long.parseLong(count_values[i]));
                }
            }
            if (splits.length > 3 && !splits[3].isEmpty()) { // should be follow or followerprecede
                String[] splits_ = splits[3].split(":");
                String type = splits_[0];
                String[] count_values = splits_[1].split(",");
                if (count_values.length > 0) {
                    if ("n_f".equals(type)) f_follow[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_p".equals(type)) f_precede[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_fp".equals(type)) f_followerprecede[0].setLongValue(Long.parseLong(count_values[0]));
                }
                for (int i = 1; i < count_values.length; i++) {
                    if ("n_f".equals(type)) f_follow[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_p".equals(type)) f_precede[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_fp".equals(type)) f_followerprecede[i].setLongValue(Long.parseLong(count_values[i]));
                }
            }
            if (splits.length > 4 && !splits[4].isEmpty()) { // should be followerprecede
                String[] splits_ = splits[4].split(":");
                String type = splits_[0];
                String[] count_values = splits_[1].split(",");
                if (count_values.length > 0) {
                    if ("n_f".equals(type)) f_follow[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_p".equals(type)) f_precede[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_fp".equals(type)) f_followerprecede[0].setLongValue(Long.parseLong(count_values[0]));
                }
                for (int i = 1; i < count_values.length; i++) {
                    if ("n_f".equals(type)) f_follow[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_p".equals(type)) f_precede[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_fp".equals(type)) f_followerprecede[i].setLongValue(Long.parseLong(count_values[i]));
                }
            }
            writer_ngram.addDocument(doc);
            while (N.length <= n) {
                N = ArrayUtils.getConcatinatedArray(N, new Long[][] { { 0L, 0L, 0L, 0L, 0L, 0L } });
                S = ArrayUtils.getConcatinatedArray(S, new Long[] { 0L });
            }
            if (num == 1L) N[n][1]++;
            else if (num == 2L) N[n][2]++;
            else if (num == 3L) N[n][3]++;
            else if (num == 4L) N[n][4]++;
            else N[n][5]++;
            N[n][0]++;
            S[n] += num;
        } catch (Exception e) {
            LOG.error("Could not process line '{}' in file '{}:{}', malformed line.", line,
                    ngram_joined_counts_file, c, e);
        }
    }
    writer_ngram.forceMergeDeletes();
    writer_ngram.commit();
    writer_ngram.close();

    StringBuilder b = new StringBuilder(String.format(
            "#%n# Number of times where an ngram occurred: %n# at_least_once, exactly_once, exactly_twice, exactly_three_times, exactly_four_times, five_times_or_more.%n#%nmax_n=%d%nmax_c=6%n",
            N.length - 1));
    for (int n = 1; n < N.length; n++)
        b.append(String.format("n%d=%s%n", n, StringUtils.join(N[n], ',')));
    for (int n = 1; n < S.length; n++)
        b.append(String.format("s%d=%d%n", n, S[n]));
    FileUtils.writeStringToFile(new File(_index_dir, "__sum_ngrams__"), b.toString());
}
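The example above creates one Document and its Field objects once, then only overwrites their values with setStringValue/setIntValue/setLongValue before each addDocument call, which avoids allocating new field objects for millions of n-grams. A stripped-down sketch of that reuse pattern, with illustrative field names and a hypothetical rows iterable and already-open writer:

    Document doc = new Document();
    StringField ngramField = new StringField("ngram", "", Field.Store.YES);
    StoredField countField = new StoredField("num", 0L);
    doc.add(ngramField);
    doc.add(countField);
    for (String[] row : rows) {            // row[0] = ngram, row[1] = count (hypothetical input)
        ngramField.setStringValue(row[0]); // overwrite the values in place ...
        countField.setLongValue(Long.parseLong(row[1]));
        writer.addDocument(doc);           // ... and re-add the same Document instance
    }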
From source file:de.tudarmstadt.lt.lm.app.GenerateNgramIndex.java
License:Apache License
public void create_vocabulary_index(File vocabulary_file) throws IOException {
    File index_dir = new File(_index_dir, "vocab");
    if (index_dir.exists()) {
        LOG.info("Vocabulary index already exists in directory '{}'.", index_dir.getAbsolutePath());
        if (_overwrite) {
            LOG.info("Overwriting index '{}',", index_dir);
            index_dir.delete();
        } else
            return;
    }
    index_dir.mkdirs();
    Analyzer analyzer = new KeywordAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
    iwc.setOpenMode(OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(1024.0);
    Directory directory = new MMapDirectory(index_dir);
    IndexWriter writer_vocab = new IndexWriter(directory, iwc);

    InputStream in = new FileInputStream(vocabulary_file);
    if (vocabulary_file.getName().endsWith(".gz"))
        in = new GZIPInputStream(in);
    LineIterator iter = new LineIterator(new BufferedReader(new InputStreamReader(in, "UTF-8")));

    Document doc = new Document();
    Field f_word = new StringField("word", "", Field.Store.YES);
    doc.add(f_word);
    long c = 0;
    while (iter.hasNext()) {
        if (++c % 10000 == 0)
            LOG.info("Adding {}'th word.", c);
        String line = iter.next();
        try {
            String word = line.trim();
            f_word.setStringValue(word);
            writer_vocab.addDocument(doc);
        } catch (Exception e) {
            LOG.warn("Could not process line '{}' in file '{}', malformed line.", line, vocabulary_file, e);
        }
    }
    writer_vocab.forceMergeDeletes();
    writer_vocab.commit();
    writer_vocab.close();
}
From source file:de.u808.simpleinquest.indexer.impl.IndexUpdater.java
License:Apache License
private void indexDocuments(List<File> files) throws CorruptIndexException, LockObtainFailedException, IOException {
    IndexWriter indexWriter = new IndexWriter(indexDirectory, new StandardAnalyzer());
    Iterator<File> iterator = files.iterator();
    while (iterator.hasNext()) {
        File file = (File) iterator.next();
        if (file.isDirectory()) {
            Document doc = DirectoryDocument.Document(file);
            indexWriter.addDocument(doc);
        } else {
            Indexer indexer = indexerFactory.getIndexer(file);
            if (indexer != null) {
                Document document = null;
                try {
                    log.debug("Memory before indexing in MB (M: "
                            + memoryFormater.format(Runtime.getRuntime().maxMemory() / (1024 * 1024)) + " T: "
                            + memoryFormater.format(Runtime.getRuntime().totalMemory() / (1024 * 1024)) + "F: "
                            + memoryFormater.format(Runtime.getRuntime().freeMemory() / (1024 * 1024)) + ")");
                    this.ensureEnoughHeapMemory();
                    String msg = "Indexing file: " + file.getPath();
                    document = indexer.indexFile(file);
                    this.setStatusMessage(msg);
                    log.info(msg);
                    log.debug("Memory after indexing in MB (M: "
                            + memoryFormater.format(Runtime.getRuntime().maxMemory() / (1024 * 1024)) + " T: "
                            + memoryFormater.format(Runtime.getRuntime().totalMemory() / (1024 * 1024)) + " F: "
                            + memoryFormater.format(Runtime.getRuntime().freeMemory() / (1024 * 1024)) + ")");
                    iterator.remove();
                } catch (IndexerException e) {
                    log.error("Error during indexing", e);
                } catch (OutOfMemoryError outOfMemoryError) {
                    log.warn("File seems to be to big for the actual free heap. Try to increase availible memory with vm option -Xmx if this is a recurring error message");
                    log.info("Try to free memory");
                    document = null;
                    System.gc();
                    this.refreschIndex();
                }
                if (document != null) {
                    indexWriter.addDocument(document);
                } else {
                    String msg = "Indexer " + indexer.getClass() + " returned no content to index";
                    this.setStatusMessage(msg);
                    log.warn(msg);
                }
            } else {
                log.debug("No indexer for file: " + file.getPath());
            }
        }
    }
    String msg = "Optimizing index";
    this.setStatusMessage(msg);
    log.info(msg);
    indexWriter.flush();
    indexWriter.optimize();
    msg = "Index optimized";
    this.setStatusMessage(msg);
    log.info(msg);
    indexWriter.close(true);
    indexWriter = null;
}
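This example targets the pre-4.0 IndexWriter API: optimize() was removed in Lucene 4.0, and close(boolean) followed in later releases. On newer versions the closest equivalent of the final optimize-and-close sequence would be roughly the following sketch (not part of the original source):

    indexWriter.forceMerge(1); // closest replacement for optimize(): merge down to one segment
    indexWriter.commit();      // make the pending changes durable
    indexWriter.close();       // recent versions commit on close unless configured otherwise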
From source file:de.unidue.inf.is.ezdl.dlservices.search.handlers.ranking.LuceneRanker.java
License:Open Source License
private void createIndex(ResultDocumentList toRank, IndexWriter indexWriter) throws CorruptIndexException, IOException {
    for (ResultDocument result : toRank) {
        Document document = result.getDocument();
        org.apache.lucene.document.Document d = new org.apache.lucene.document.Document();
        StringBuilder sb = new StringBuilder();
        String oid = document.getOid();
        Field.Store store = Field.Store.NO;
        Field field;
        if (!StringUtils.isEmpty(oid)) {
            field = new Field("oid", oid, Field.Store.YES, Field.Index.NO);
            d.add(field);
            String title = document.getTitle();
            if (!StringUtils.isEmpty(title)) {
                field = new Field(de.unidue.inf.is.ezdl.dlcore.data.fields.Field.TITLE.toString(), title, store,
                        Field.Index.ANALYZED);
                field.setOmitNorms(true);
                field.setBoost(2.0f);
                d.add(field);
                sb.append(title);
                sb.append(" ");
            }
            if (document instanceof TextDocument) {
                String docAbstract = ((TextDocument) document).getAbstract();
                if (!StringUtils.isEmpty(docAbstract)) {
                    field = new Field(de.unidue.inf.is.ezdl.dlcore.data.fields.Field.ABSTRACT.toString(),
                            docAbstract, store, Field.Index.ANALYZED);
                    d.add(field);
                    sb.append(docAbstract);
                    sb.append(" ");
                }
            }
            int year = document.getYear();
            if (year != 0) {
                field = new Field(de.unidue.inf.is.ezdl.dlcore.data.fields.Field.YEAR.toString(),
                        String.valueOf(year), store, Field.Index.NOT_ANALYZED);
                d.add(field);
                sb.append(" ");
                sb.append(year);
            }
            PersonList authorList = document.getAuthorList();
            if (authorList != null) {
                field = new Field(de.unidue.inf.is.ezdl.dlcore.data.fields.Field.AUTHOR.toString(),
                        authorList.toString(), store, Field.Index.ANALYZED);
                d.add(field);
                sb.append(authorList.toString());
            }
            field = new Field(de.unidue.inf.is.ezdl.dlcore.data.fields.Field.TEXT.toString(), sb.toString(),
                    store, Field.Index.ANALYZED);
            d.add(field);
            indexWriter.addDocument(d);
        }
    }
    indexWriter.commit();
}
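This ranker uses the pre-4.0 Field(String, String, Field.Store, Field.Index) constructor. In Lucene 4 and later the same intent is normally expressed with typed fields and a FieldType; a hedged sketch of how the boosted, norm-free title field above could be written there (the "title" field name and the title/d variables mirror the example, nothing here is from the original source):

    FieldType titleType = new FieldType(TextField.TYPE_NOT_STORED);
    titleType.setOmitNorms(true); // replaces field.setOmitNorms(true)
    titleType.freeze();
    Field titleField = new Field("title", title, titleType);
    titleField.setBoost(2.0f);    // index-time boosts still exist in 4.x-6.x, removed in 7.0
    d.add(titleField);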
From source file:de.uni_koeln.spinfo.maalr.lucene.core.DictionaryCreator.java
License:Apache License
private int indexDocs(final IndexWriter writer, final Iterator<LexEntry> iterator) throws IOException {
    int counter = 0;
    NumberFormat nf = NumberFormat.getNumberInstance();
    while (iterator.hasNext()) {
        LexEntry lexEntry = iterator.next();
        List<Document> docs = createDocument(lexEntry);
        if (tracing) {
            logger.trace("Indexing Documents: " + docs);
        }
        for (Document doc : docs) {
            writer.addDocument(doc);
        }
        counter++;
        if (counter % 10000 == 0) {
            logger.debug("Indexed " + nf.format(counter) + " documents.");
        }
    }
    logger.info("###########################################");
    logger.info("Indexing completed - " + nf.format(counter) + " entries have been indexed.");
    logger.info("###########################################");
    return counter;
}
From source file:de.uni_koeln.spinfo.maalr.lucene.core.DictionaryCreator.java
License:Apache License
void update(LexEntry entry) throws IOException {
    IndexWriter writer = initIndexWriter();
    Term queryTerm = new Term(LexEntry.ID, entry.getId());
    writer.deleteDocuments(queryTerm);
    if (entry.getCurrent() != null) {
        List<Document> docs = createDocument(entry);
        for (Document document : docs) {
            writer.addDocument(document);
        }
    }
    writer.close();
}
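The delete-then-add sequence above can also be written with IndexWriter.updateDocument(Term, doc), which deletes every document matching the term and adds the replacement in one atomic operation; updateDocuments(Term, docs) does the same for a block of documents. A sketch assuming a single replacement document called newDoc:

    Term idTerm = new Term(LexEntry.ID, entry.getId());
    writer.updateDocument(idTerm, newDoc); // atomic delete-and-add
    writer.commit();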
From source file:de.uni_koeln.spinfo.maalr.lucene.core.DictionaryLoader.java
License:Apache License
void update(LexEntry entry) throws IOException {
    IndexWriter writer = initIndexWriter();
    Term queryTerm = new Term(LexEntry.ID, entry.getId());
    writer.deleteDocuments(queryTerm);
    if (entry.getCurrent() != null) {
        List<Document> docs = createDocument(new HashSet<String>(), entry);
        for (Document document : docs) {
            writer.addDocument(document);
        }
    }
    writer.commit();
    writer.close();
    reader.close();
    reader = DirectoryReader.open(ram);
    searcher = new IndexSearcher(reader);
}
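Closing the reader and reopening it from scratch, as above, works but pays the full open cost on every update. DirectoryReader.openIfChanged can reuse unchanged segments and returns null when nothing changed; a hedged sketch of that alternative:

    DirectoryReader newReader = DirectoryReader.openIfChanged(reader);
    if (newReader != null) {           // null means the index did not change
        reader.close();
        reader = newReader;
        searcher = new IndexSearcher(reader);
    }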
From source file:demo.jaxrs.search.server.Catalog.java
License:Apache License
private void storeAndIndex(final LuceneDocumentMetadata metadata, final byte[] content) throws IOException {
    try (BufferedInputStream in = new BufferedInputStream(new ByteArrayInputStream(content))) {
        final Document document = extractor.extract(in, metadata);
        if (document != null) {
            final IndexWriter writer = getIndexWriter();
            try {
                storage.addDocument(metadata.getSource(), content);
                writer.addDocument(document);
                writer.commit();
            } finally {
                writer.close();
            }
        }
    }
}
From source file:demo.jaxrs.search.server.Indexer.java
License:Apache License
public void storeAndIndex(final LuceneDocumentMetadata metadata, final byte[] content) throws IOException {
    BufferedInputStream in = null;
    try {
        in = new BufferedInputStream(new ByteArrayInputStream(content));
        final Document document = extractor.extract(in, metadata);
        if (document != null) {
            final IndexWriter writer = getIndexWriter();
            try {
                storage.addDocument(metadata.getSource(), content);
                writer.addDocument(document);
                writer.commit();
            } finally {
                writer.close();
            }
        }
    } finally {
        if (in != null) {
            try {
                in.close();
            } catch (IOException ex) {
                /* do nothing */
            }
        }
    }
}
From source file:Demo1.MyServlet.java
private static void addDoc(IndexWriter w, String Name, String Price, String Area, String Purpose) throws IOException {
    Document doc = new Document();
    // StringFields are stored but not tokenized, so they support exact-match lookups only
    doc.add(new StringField("name", Name, Field.Store.YES));
    doc.add(new StringField("price", Price, Field.Store.YES));
    doc.add(new StringField("area", Area, Field.Store.YES));
    doc.add(new StringField("purpose", Purpose, Field.Store.YES));
    // The combined TextField is tokenized (but not stored) for full-text search
    String Searching = Name + " " + Price + " " + Area + " " + Purpose;
    doc.add(new TextField("Searching", Searching, Field.Store.NO));
    w.addDocument(doc);
}
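To show how documents added this way come back out, a minimal hedged search against the untokenized name field; the dir variable and the query value are placeholders, not part of the servlet:

    // Exact-match lookup on the untokenized StringField "name".
    DirectoryReader reader = DirectoryReader.open(dir); // dir: the Directory the writer wrote to (assumed)
    IndexSearcher searcher = new IndexSearcher(reader);
    TopDocs hits = searcher.search(new TermQuery(new Term("name", "Some Property")), 10);
    for (ScoreDoc sd : hits.scoreDocs) {
        Document hit = searcher.doc(sd.doc);
        System.out.println(hit.get("name") + " / " + hit.get("price"));
    }
    reader.close();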