Example usage for org.apache.lucene.index IndexWriter addDocument

Introduction

This page collects usage examples for the org.apache.lucene.index.IndexWriter#addDocument method from several open source projects.

Prototype

public long addDocument(Iterable<? extends IndexableField> doc) throws IOException 

Document

Adds a document to this index.
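
For orientation, here is a minimal, self-contained sketch of the call, assuming a Lucene 4.x-era setup to match the examples below (on 4.x, addDocument returns void; the long sequence number in the prototype above comes from newer releases). The directory, field name, and value are purely illustrative.

import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class AddDocumentSketch {
    public static void main(String[] args) throws IOException {
        Directory dir = new RAMDirectory(); // in-memory index, sufficient for a demo
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_9,
                new StandardAnalyzer(Version.LUCENE_4_9));
        IndexWriter writer = new IndexWriter(dir, iwc);
        try {
            Document doc = new Document();
            doc.add(new StringField("id", "doc-1", Store.YES)); // indexed verbatim, stored
            writer.addDocument(doc); // buffered in RAM; searchable after commit/close
            writer.commit();
        } finally {
            writer.close();
        }
    }
}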

Usage

From source file: de.tudarmstadt.lt.lm.app.GenerateNgramIndex.java

License: Apache License

public void create_ngram_index(File ngram_joined_counts_file) throws IOException {
    File index_dir = new File(_index_dir, "ngram");
    if (index_dir.exists()) {
        LOG.info("Ngram index already exists in directory '{}'.", index_dir.getAbsolutePath());
        if (_overwrite) {
            LOG.info("Overwriting index '{}',", index_dir);
            index_dir.delete();//from   w w  w  .  ja  v a2  s. c  o  m
        } else
            return;
    }
    index_dir.mkdirs();

    Analyzer analyzer = new KeywordAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
    iwc.setOpenMode(OpenMode.CREATE);
    // use 80 percent of the available total memory
    double total_mem_mb = (double) Runtime.getRuntime().maxMemory() / 1e6;
    double percentage_ram_buffer = Properties.ramBufferPercentage();
    if (percentage_ram_buffer > 0) {
        double percentage_ram_buffer_mb = total_mem_mb * percentage_ram_buffer;
        LOG.info(String.format("Setting ram buffer size to %.2f MB (%.2f%% from %.2f MB)",
                percentage_ram_buffer_mb, percentage_ram_buffer * 100, total_mem_mb));
        iwc.setRAMBufferSizeMB(percentage_ram_buffer_mb);
    }

    Directory directory = new MMapDirectory(index_dir);
    IndexWriter writer_ngram = new IndexWriter(directory, iwc);

    InputStream in = new FileInputStream(ngram_joined_counts_file);
    if (ngram_joined_counts_file.getName().endsWith(".gz"))
        in = new GZIPInputStream(in);
    LineIterator iter = new LineIterator(new BufferedReader(new InputStreamReader(in, "UTF-8")));

    Document doc = new Document();
    Field f_ngram = new StringField("ngram", "", Store.YES);
    doc.add(f_ngram);
    Field f_n = new IntField("cardinality", 0, Store.YES);
    doc.add(f_n);
    Field f_word = new StringField("word", "", Store.YES);
    doc.add(f_word);
    Field f_hist = new StringField("history", "", Store.YES);
    doc.add(f_hist);
    Field f_lower = new StringField("lower", "", Store.YES);
    doc.add(f_lower);
    Field f_count = new StoredField("num", 0L);
    doc.add(f_count);

    Field[] f_follow = new Field[4];
    f_follow[0] = new StoredField("nf_s", 0L);
    doc.add(f_follow[0]);
    f_follow[1] = new StoredField("nf_N1", 0L);
    doc.add(f_follow[1]);
    f_follow[2] = new StoredField("nf_N2", 0L);
    doc.add(f_follow[2]);
    f_follow[3] = new StoredField("nf_N3", 0L);
    doc.add(f_follow[3]);
    Field[] f_precede = new Field[4];
    f_precede[0] = new StoredField("np_s", 0L);
    doc.add(f_precede[0]);
    f_precede[1] = new StoredField("np_N1", 0L);
    doc.add(f_precede[1]);
    f_precede[2] = new StoredField("np_N2", 0L);
    doc.add(f_precede[2]);
    f_precede[3] = new StoredField("np_N3", 0L);
    doc.add(f_precede[3]);
    Field[] f_followerprecede = new Field[4];
    f_followerprecede[0] = new StoredField("nfp_s", 0L);
    doc.add(f_followerprecede[0]);
    f_followerprecede[1] = new StoredField("nfp_N1", 0L);
    doc.add(f_followerprecede[1]);
    f_followerprecede[2] = new StoredField("nfp_N2", 0L);
    doc.add(f_followerprecede[2]);
    f_followerprecede[3] = new StoredField("nfp_N3", 0L);
    doc.add(f_followerprecede[3]);

    Long[][] N = new Long[][] { { 0L, 0L, 0L, 0L, 0L, 0L } };
    Long[] S = new Long[] { 0L };
    long c = 0;
    while (iter.hasNext()) {
        if (++c % 100000 == 0)
            LOG.info("Adding {}'th ngram.", c);
        String line = iter.next();
        try {
            String[] splits = de.tudarmstadt.lt.utilities.StringUtils.rtrim(line).split("\t");
            String ngram_str = splits[0];
            if (de.tudarmstadt.lt.utilities.StringUtils.trim(ngram_str).isEmpty()) {
                LOG.warn("Ngram is empty, skipping line {}: '{}' (file '{}').", c, line,
                        ngram_joined_counts_file);
                continue;
            }

            List<String> ngram = Arrays.asList(ngram_str.split(" "));
            long num = Long.parseLong(splits[1]);
            int n = ngram.size();

            f_ngram.setStringValue(ngram_str);
            f_n.setIntValue(n);
            f_word.setStringValue(ngram.get(ngram.size() - 1));
            f_hist.setStringValue(StringUtils.join(ngram.subList(0, ngram.size() - 1), " "));
            f_lower.setStringValue(StringUtils.join(ngram.subList(1, ngram.size()), " "));
            f_count.setLongValue(num);

            for (int j = 0; j < f_follow.length; j++) {
                f_follow[j].setLongValue(0L);
                f_precede[j].setLongValue(0L);
                f_followerprecede[j].setLongValue(0L);
            }

            if (splits.length > 2 && !splits[2].isEmpty()) {
                // precede or follow or followerprecede
                String[] splits_ = splits[2].split(":");
                String type = splits_[0];
                String[] count_values = splits_[1].split(",");
                if (count_values.length > 0) {
                    if ("n_f".equals(type))
                        f_follow[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_p".equals(type))
                        f_precede[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_fp".equals(type))
                        f_followerprecede[0].setLongValue(Long.parseLong(count_values[0]));
                }
                for (int i = 1; i < count_values.length; i++) {
                    if ("n_f".equals(type))
                        f_follow[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_p".equals(type))
                        f_precede[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_fp".equals(type))
                        f_followerprecede[i].setLongValue(Long.parseLong(count_values[i]));
                }
            }
            if (splits.length > 3 && !splits[3].isEmpty()) {
                // should be follow or followerprecede
                String[] splits_ = splits[3].split(":");
                String type = splits_[0];
                String[] count_values = splits_[1].split(",");
                if (count_values.length > 0) {
                    if ("n_f".equals(type))
                        f_follow[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_p".equals(type))
                        f_precede[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_fp".equals(type))
                        f_followerprecede[0].setLongValue(Long.parseLong(count_values[0]));
                }
                for (int i = 1; i < count_values.length; i++) {
                    if ("n_f".equals(type))
                        f_follow[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_p".equals(type))
                        f_precede[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_fp".equals(type))
                        f_followerprecede[i].setLongValue(Long.parseLong(count_values[i]));
                }
            }
            if (splits.length > 4 && !splits[4].isEmpty()) {
                // should be followerprecede
                String[] splits_ = splits[4].split(":");
                String type = splits_[0];
                String[] count_values = splits_[1].split(",");
                if (count_values.length > 0) {
                    if ("n_f".equals(type))
                        f_follow[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_p".equals(type))
                        f_precede[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_fp".equals(type))
                        f_followerprecede[0].setLongValue(Long.parseLong(count_values[0]));
                }
                for (int i = 1; i < count_values.length; i++) {
                    if ("n_f".equals(type))
                        f_follow[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_p".equals(type))
                        f_precede[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_fp".equals(type))
                        f_followerprecede[i].setLongValue(Long.parseLong(count_values[i]));
                }
            }

            writer_ngram.addDocument(doc);

            while (N.length <= n) {
                N = ArrayUtils.getConcatinatedArray(N, new Long[][] { { 0L, 0L, 0L, 0L, 0L, 0L } });
                S = ArrayUtils.getConcatinatedArray(S, new Long[] { 0L });
            }

            if (num == 1L)
                N[n][1]++;
            else if (num == 2L)
                N[n][2]++;
            else if (num == 3L)
                N[n][3]++;
            else if (num == 4L)
                N[n][4]++;
            else
                N[n][5]++;
            N[n][0]++;
            S[n] += num;

        } catch (Exception e) {
            LOG.error("Could not process line '{}' in file '{}:{}', malformed line.", line,
                    ngram_joined_counts_file, c, e);
        }
    }

    writer_ngram.forceMergeDeletes();
    writer_ngram.commit();
    writer_ngram.close();

    StringBuilder b = new StringBuilder(String.format(
            "#%n# Number of times where an ngram occurred: %n#  at_least_once, exactly_once, exactly_twice, exactly_three_times, exactly_four_times, five_times_or_more.%n#%nmax_n=%d%nmax_c=6%n",
            N.length - 1));
    for (int n = 1; n < N.length; n++)
        b.append(String.format("n%d=%s%n", n, StringUtils.join(N[n], ',')));
    for (int n = 1; n < S.length; n++)
        b.append(String.format("s%d=%d%n", n, S[n]));
    FileUtils.writeStringToFile(new File(_index_dir, "__sum_ngrams__"), b.toString());

}
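
A detail worth copying from this example: it allocates a single Document and a fixed set of Field instances once, then mutates them with setStringValue/setIntValue/setLongValue before every addDocument call, avoiding per-line object churn. A stripped-down sketch of the same pattern (the field name and token source are illustrative):

Document doc = new Document();
StringField f_word = new StringField("word", "", Store.YES);
doc.add(f_word);
for (String word : words) {       // 'words' stands in for any token source
    f_word.setStringValue(word);  // mutate the field in place
    writer.addDocument(doc);      // indexes the field's current value
}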

From source file: de.tudarmstadt.lt.lm.app.GenerateNgramIndex.java

License: Apache License

public void create_vocabulary_index(File vocabulary_file) throws IOException {
    File index_dir = new File(_index_dir, "vocab");
    if (index_dir.exists()) {
        LOG.info("Vocabulary index already exists in directory '{}'.", index_dir.getAbsolutePath());
        if (_overwrite) {
            LOG.info("Overwriting index '{}',", index_dir);
            index_dir.delete();//from   www .  j a v  a 2s .  c  o m
        } else
            return;
    }
    index_dir.mkdirs();
    Analyzer analyzer = new KeywordAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
    iwc.setOpenMode(OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(1024.0);
    Directory directory = new MMapDirectory(index_dir);
    IndexWriter writer_vocab = new IndexWriter(directory, iwc);

    InputStream in = new FileInputStream(vocabulary_file);
    if (vocabulary_file.getName().endsWith(".gz"))
        in = new GZIPInputStream(in);
    LineIterator iter = new LineIterator(new BufferedReader(new InputStreamReader(in, "UTF-8")));
    Document doc = new Document();
    Field f_word = new StringField("word", "", Field.Store.YES);
    doc.add(f_word);
    long c = 0;
    while (iter.hasNext()) {
        if (++c % 10000 == 0)
            LOG.info("Adding {}'th word.", c);
        String line = iter.next();
        try {
            String word = line.trim();
            f_word.setStringValue(word);
            writer_vocab.addDocument(doc);
        } catch (Exception e) {
            LOG.warn("Could not process line '{}' in file '{}', malformed line.", line, vocabulary_file, e);
        }
    }

    writer_vocab.forceMergeDeletes();
    writer_vocab.commit();
    writer_vocab.close();
}

From source file: de.u808.simpleinquest.indexer.impl.IndexUpdater.java

License: Apache License

private void indexDocuments(List<File> files)
        throws CorruptIndexException, LockObtainFailedException, IOException {
    IndexWriter indexWriter = new IndexWriter(indexDirectory, new StandardAnalyzer());
    Iterator<File> iterator = files.iterator();
    while (iterator.hasNext()) {
        File file = iterator.next();
        if (file.isDirectory()) {
            Document doc = DirectoryDocument.Document(file);
            indexWriter.addDocument(doc);
        } else {
            Indexer indexer = indexerFactory.getIndexer(file);
            if (indexer != null) {
                Document document = null;
                try {
                    log.debug("Memory before indexing in MB (M: "
                            + memoryFormater.format(Runtime.getRuntime().maxMemory() / (1024 * 1024)) + " T: "
                            + memoryFormater.format(Runtime.getRuntime().totalMemory() / (1024 * 1024)) + " F: "
                            + memoryFormater.format(Runtime.getRuntime().freeMemory() / (1024 * 1024)) + ")");
                    this.ensureEnoughHeapMemory();
                    String msg = "Indexing file: " + file.getPath();
                    document = indexer.indexFile(file);
                    this.setStatusMessage(msg);
                    log.info(msg);
                    log.debug("Memory after indexing in MB (M: "
                            + memoryFormater.format(Runtime.getRuntime().maxMemory() / (1024 * 1024)) + " T: "
                            + memoryFormater.format(Runtime.getRuntime().totalMemory() / (1024 * 1024)) + " F: "
                            + memoryFormater.format(Runtime.getRuntime().freeMemory() / (1024 * 1024)) + ")");
                    iterator.remove();
                } catch (IndexerException e) {
                    log.error("Error during indexing", e);
                } catch (OutOfMemoryError outOfMemoryError) {
                    log.warn(
                            "File seems to be too big for the currently free heap. Try increasing available memory with the VM option -Xmx if this is a recurring error message.");
                    log.info("Trying to free memory");
                    document = null;
                    System.gc();
                    this.refreschIndex();
                }
                if (document != null) {
                    indexWriter.addDocument(document);
                } else {
                    String msg = "Indexer " + indexer.getClass() + " returned no content to index";
                    this.setStatusMessage(msg);
                    log.warn(msg);
                }
            } else {
                log.debug("No indexer for file: " + file.getPath());
            }
        }
    }
    String msg = "Optimizing index";
    this.setStatusMessage(msg);
    log.info(msg);
    indexWriter.flush();
    indexWriter.optimize();
    msg = "Index optimized";
    this.setStatusMessage(msg);
    log.info(msg);
    indexWriter.close(true);
    indexWriter = null;
}
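
Note that this example uses the pre-4.0 IndexWriter API: optimize() was deprecated in Lucene 3.5 and removed in 4.0, and close(boolean) was removed later as well. On Lucene 4.x and newer, the closing sequence would look roughly like this (a sketch, not a drop-in replacement):

indexWriter.forceMerge(1); // successor of optimize(): merge the index down to one segment
indexWriter.commit();
indexWriter.close();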

From source file: de.unidue.inf.is.ezdl.dlservices.search.handlers.ranking.LuceneRanker.java

License: Open Source License

private void createIndex(ResultDocumentList toRank, IndexWriter indexWriter)
        throws CorruptIndexException, IOException {
    for (ResultDocument result : toRank) {
        Document document = result.getDocument();
        org.apache.lucene.document.Document d = new org.apache.lucene.document.Document();

        StringBuilder sb = new StringBuilder();

        String oid = document.getOid();

        Field.Store store = Field.Store.NO;

        Field field;
        if (!StringUtils.isEmpty(oid)) {
            field = new Field("oid", oid, Field.Store.YES, Field.Index.NO);
            d.add(field);
            String title = document.getTitle();
            if (!StringUtils.isEmpty(title)) {
                field = new Field(de.unidue.inf.is.ezdl.dlcore.data.fields.Field.TITLE.toString(), title, store,
                        Field.Index.ANALYZED);
                field.setOmitNorms(true);
                field.setBoost(2.0f);
                d.add(field);
                sb.append(title);
                sb.append(" ");
            }
            if (document instanceof TextDocument) {
                String docAbstract = ((TextDocument) document).getAbstract();
                if (!StringUtils.isEmpty(docAbstract)) {
                    field = new Field(de.unidue.inf.is.ezdl.dlcore.data.fields.Field.ABSTRACT.toString(),
                            docAbstract, store, Field.Index.ANALYZED);
                    d.add(field);
                    sb.append(docAbstract);
                    sb.append(" ");
                }
            }
            int year = document.getYear();
            if (year != 0) {
                field = new Field(de.unidue.inf.is.ezdl.dlcore.data.fields.Field.YEAR.toString(),
                        String.valueOf(year), store, Field.Index.NOT_ANALYZED);
                d.add(field);
                sb.append(" ");
                sb.append(year);
            }
            PersonList authorList = document.getAuthorList();
            if (authorList != null) {
                field = new Field(de.unidue.inf.is.ezdl.dlcore.data.fields.Field.AUTHOR.toString(),
                        authorList.toString(), store, Field.Index.ANALYZED);
                d.add(field);
                sb.append(authorList.toString());
            }
            field = new Field(de.unidue.inf.is.ezdl.dlcore.data.fields.Field.TEXT.toString(),
                    sb.toString(), store, Field.Index.ANALYZED);
            d.add(field);

            indexWriter.addDocument(d);
        }
    }
    indexWriter.commit();
}

From source file: de.uni_koeln.spinfo.maalr.lucene.core.DictionaryCreator.java

License: Apache License

private int indexDocs(final IndexWriter writer, final Iterator<LexEntry> iterator) throws IOException {
    int counter = 0;
    NumberFormat nf = NumberFormat.getNumberInstance();
    while (iterator.hasNext()) {
        LexEntry lexEntry = iterator.next();
        List<Document> docs = createDocument(lexEntry);
        if (tracing) {
            logger.trace("Indexing Documents: " + docs);
        }
        for (Document doc : docs) {
            writer.addDocument(doc);
        }
        counter++;
        if (counter % 10000 == 0) {
            logger.debug("Indexed " + nf.format(counter) + " documents.");
        }
    }
    logger.info("###########################################");
    logger.info("Indexing completed - " + nf.format(counter) + " entries have been indexed.");
    logger.info("###########################################");
    return counter;
}

From source file: de.uni_koeln.spinfo.maalr.lucene.core.DictionaryCreator.java

License: Apache License

void update(LexEntry entry) throws IOException {
    IndexWriter writer = initIndexWriter();
    Term queryTerm = new Term(LexEntry.ID, entry.getId());
    writer.deleteDocuments(queryTerm);
    if (entry.getCurrent() != null) {
        List<Document> docs = createDocument(entry);
        for (Document document : docs) {
            writer.addDocument(document);
        }
    }
    writer.close();
}

From source file: de.uni_koeln.spinfo.maalr.lucene.core.DictionaryLoader.java

License: Apache License

void update(LexEntry entry) throws IOException {
    IndexWriter writer = initIndexWriter();
    Term queryTerm = new Term(LexEntry.ID, entry.getId());
    writer.deleteDocuments(queryTerm);
    if (entry.getCurrent() != null) {
        List<Document> docs = createDocument(new HashSet<String>(), entry);
        for (Document document : docs) {
            writer.addDocument(document);
        }
    }
    writer.commit();
    writer.close();
    reader.close();
    reader = DirectoryReader.open(ram);
    searcher = new IndexSearcher(reader);
}
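
Closing the reader and reopening it from scratch works, but DirectoryReader.openIfChanged can reuse the unchanged segments and is typically cheaper. A sketch, under the assumption that reader is a DirectoryReader, as the DirectoryReader.open(ram) call above suggests:

DirectoryReader newReader = DirectoryReader.openIfChanged(reader);
if (newReader != null) { // null means the index has not changed
    reader.close();
    reader = newReader;
}
searcher = new IndexSearcher(reader);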

From source file: demo.jaxrs.search.server.Catalog.java

License: Apache License

private void storeAndIndex(final LuceneDocumentMetadata metadata, final byte[] content) throws IOException {

    try (BufferedInputStream in = new BufferedInputStream(new ByteArrayInputStream(content))) {

        final Document document = extractor.extract(in, metadata);
        if (document != null) {
            final IndexWriter writer = getIndexWriter();

            try {
                storage.addDocument(metadata.getSource(), content);
                writer.addDocument(document);
                writer.commit();
            } finally {
                writer.close();
            }
        }
    }
}

From source file: demo.jaxrs.search.server.Indexer.java

License: Apache License

public void storeAndIndex(final LuceneDocumentMetadata metadata, final byte[] content) throws IOException {
    BufferedInputStream in = null;

    try {
        in = new BufferedInputStream(new ByteArrayInputStream(content));

        final Document document = extractor.extract(in, metadata);
        if (document != null) {
            final IndexWriter writer = getIndexWriter();

            try {
                storage.addDocument(metadata.getSource(), content);
                writer.addDocument(document);
                writer.commit();
            } finally {
                writer.close();
            }
        }
    } finally {
        if (in != null) {
            try {
                in.close();
            } catch (IOException ex) {
                /* do nothing */
            }
    }
}

From source file: Demo1.MyServlet.java

private static void addDoc(IndexWriter w, String Name, String Price, String Area, String Purpose)
        throws IOException {
    Document doc = new Document();
    // StringField values are indexed verbatim (not tokenized)
    doc.add(new StringField("name", Name, Field.Store.YES));
    doc.add(new StringField("price", Price, Field.Store.YES));
    doc.add(new StringField("area", Area, Field.Store.YES));
    doc.add(new StringField("purpose", Purpose, Field.Store.YES));
    // the combined catch-all field is a TextField, so it is tokenized for full-text search
    String Searching = Name + " " + Price + " " + Area + " " + Purpose;
    doc.add(new TextField("Searching", Searching, Field.Store.NO));
    w.addDocument(doc);
}
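
Since every searchable value is funneled into the catch-all "Searching" TextField, queries would normally target that field. A hedged sketch of the matching search side, assuming the same Lucene 4.x-era setup and a Directory named index (hypothetical here); the query string is illustrative and ParseException handling is elided:

IndexReader reader = DirectoryReader.open(index);
IndexSearcher searcher = new IndexSearcher(reader);
Query q = new QueryParser(Version.LUCENE_4_9, "Searching", new StandardAnalyzer(Version.LUCENE_4_9))
        .parse("garden"); // illustrative user query
TopDocs hits = searcher.search(q, 10);
for (ScoreDoc sd : hits.scoreDocs) {
    Document d = searcher.doc(sd.doc);
    System.out.println(d.get("name") + " / " + d.get("price"));
}
reader.close();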