List of usage examples for org.apache.lucene.document.StoredField (constructor)
public StoredField(String name, double value)
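Before the collected real-world examples, a minimal self-contained sketch of this constructor; the field name and the in-memory directory are illustrative, assuming a Lucene 7.x classpath:

import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class StoredFieldDoubleExample {
    public static void main(String[] args) throws Exception {
        Directory dir = new RAMDirectory();
        try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new WhitespaceAnalyzer()))) {
            Document doc = new Document();
            // StoredField is stored-only: it travels with the document
            // but is neither indexed nor searchable on its own.
            doc.add(new StoredField("price", 19.99));
            writer.addDocument(doc);
        }
        try (DirectoryReader reader = DirectoryReader.open(dir)) {
            // Stored numeric values come back as a generic Number.
            double price = reader.document(0).getField("price").numericValue().doubleValue();
            System.out.println(price); // 19.99
        }
    }
}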
From source file: index.IndexOmimtsv.java
License: Apache License
/** Indexes a single document */
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    try (InputStream stream = Files.newInputStream(file)) {
        // make a new, empty document
        Document doc = new Document();

        // Add the path of the file as a field named "path". Use a
        // field that is indexed (i.e. searchable), but don't tokenize
        // the field into separate words and don't index term frequency
        // or positional information:
        Field pathField = new StringField("path", file.toString(), Field.Store.YES);
        doc.add(pathField);

        // Add the last modified date of the file as a field named "modified".
        // Use a LongPoint that is indexed (i.e. efficiently filterable with
        // PointRangeQuery). This indexes to millisecond resolution, which
        // is often too fine. You could instead create a number based on
        // year/month/day/hour/minutes/seconds, down to the resolution you require.
        // For example the long value 2011021714 would mean
        // February 17, 2011, 2-3 PM.
        doc.add(new LongPoint("modified", lastModified));

        // Add the contents of the file to a field named "contents". Specify a Reader,
        // so that the text of the file is tokenized and indexed, but not stored.
        // Note that FileReader expects the file to be in UTF-8 encoding.
        // If that's not the case searching for special characters will fail.
        InputStreamReader ipsr = new InputStreamReader(stream);
        BufferedReader br = new BufferedReader(ipsr);
        String line = br.readLine(); // skip the TSV header line
        int cpt = 0;
        while ((line = br.readLine()) != null) {
            String[] tokens = line.trim().split("\t");
            if (tokens.length > 6) {
                String id = tokens[0].split("/")[tokens[0].split("/").length - 1].trim();
                if (id.matches("^[0-9]*")) {
                    doc = new Document();
                    cpt++;
                    doc.add(new TextField("ID", id, Field.Store.NO));
                    if (!tokens[5].trim().matches("^C[0-9].*")) {
                        for (String token : tokens) {
                            if (token.trim().matches("^C[0-9].*")) {
                                doc.add(new StoredField("CUI", token.trim()));
                                break;
                            }
                        }
                        if (doc.getFields().size() != 2) {
                            doc.add(new StoredField("CUI", ""));
                        }
                    } else {
                        doc.add(new StoredField("CUI", tokens[5].trim()));
                    }
                    doc.add(new StoredField("Label", tokens[1].trim()));
                    writer.addDocument(doc);
                }
            }
        }
        System.out.println("Number of elements: " + cpt);

        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // New index, so we just add the document (no old document can be there):
            System.out.println("adding " + file);
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been indexed) so
            // we use updateDocument instead to replace the old one matching the exact
            // path, if present:
            System.out.println("updating " + file);
            writer.updateDocument(new Term("path", file.toString()), doc);
        }
    }
}
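A hedged sketch of reading those stored fields back at search time; the index path, searcher setup, and queried ID are illustrative, not part of the original source:

IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("omim-index")));
IndexSearcher searcher = new IndexSearcher(reader);
TopDocs hits = searcher.search(new TermQuery(new Term("ID", "104300")), 1);
if (hits.scoreDocs.length > 0) {
    // "CUI" and "Label" were added as StoredFields, so they come back with the document.
    Document hit = searcher.doc(hits.scoreDocs[0].doc);
    System.out.println(hit.get("CUI") + " / " + hit.get("Label"));
}
reader.close();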
From source file: indexer.DocVector.java
public Document constructDoc() {
    Document doc = new Document();
    doc.add(new Field(FIELD_ID, docName == null ? "" : docName,
            Field.Store.YES, Field.Index.NOT_ANALYZED));
    // Store the vector as a byte array (binary format rather than text format,
    // which takes more space).
    doc.add(new StoredField(FIELD_VEC, this.getVecBytes(x)));
    // doc.add(new Field(FIELD_CELL_ID, quantized, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
    doc.add(new Field(FIELD_CELL_ID, quantized, Field.Store.NO,
            Field.Index.ANALYZED, Field.TermVector.NO));
    return doc;
}
From source file: indexer.DocVector.java
public Document constructDoc(DocVector wholeDocvec) {
    Document doc = new Document();
    doc.add(new Field(FIELD_SUBVEC_ID, docName == null ? "" : docName,
            Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(new Field(FIELD_ID, wholeDocvec.docName == null ? "" : wholeDocvec.docName,
            Field.Store.YES, Field.Index.NOT_ANALYZED));
    // Store the vectors as byte arrays (binary format rather than text format,
    // which takes more space).
    doc.add(new StoredField(FIELD_PARENT_VEC, wholeDocvec.getVecBytes(x)));
    doc.add(new StoredField(FIELD_VEC, this.getVecBytes(x)));
    // doc.add(new Field(FIELD_CELL_ID, quantized, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
    doc.add(new Field(FIELD_CELL_ID, quantized, Field.Store.NO,
            Field.Index.ANALYZED, Field.TermVector.NO));
    return doc;
}
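Document.getBinaryValue returns the stored bytes as a BytesRef. A sketch of recovering a float vector from it, assuming getVecBytes (not shown above) packs the components as consecutive big-endian floats:

BytesRef stored = doc.getBinaryValue(FIELD_VEC);
FloatBuffer floats = ByteBuffer.wrap(stored.bytes, stored.offset, stored.length).asFloatBuffer();
float[] vec = new float[floats.remaining()];
floats.get(vec);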
From source file: io.anserini.embeddings.IndexW2V.java
License: Apache License
public void indexEmbeddings() throws IOException, InterruptedException {
    LOG.info("Starting indexer...");
    long startTime = System.currentTimeMillis();

    final WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
    final IndexWriterConfig config = new IndexWriterConfig(analyzer);
    final IndexWriter writer = new IndexWriter(directory, config);

    BufferedReader bRdr = new BufferedReader(new FileReader(args.input));
    String line = null;
    bRdr.readLine(); // skip the header line
    Document document = new Document();
    ByteArrayOutputStream byteStream = new ByteArrayOutputStream();
    int cnt = 0;

    while ((line = bRdr.readLine()) != null) {
        String[] termEmbedding = line.trim().split("\t");
        document.add(new StringField(LuceneDocumentGenerator.FIELD_ID, termEmbedding[0], Field.Store.NO));

        // Pack the embedding components into a byte array, one big-endian float each.
        String[] parts = termEmbedding[1].split(" ");
        for (int i = 0; i < parts.length; ++i) {
            byteStream.write(ByteBuffer.allocate(4).putFloat(Float.parseFloat(parts[i])).array());
        }
        document.add(new StoredField(FIELD_BODY, byteStream.toByteArray()));

        byteStream.flush();
        byteStream.reset();
        writer.addDocument(document);
        document.clear();

        cnt++;
        if (cnt % 100000 == 0) {
            LOG.info(cnt + " terms indexed");
        }
    }
    LOG.info(String.format("Total of %s terms added", cnt));

    try {
        writer.commit();
        writer.forceMerge(1);
    } finally {
        try {
            writer.close();
        } catch (IOException e) {
            LOG.error(e);
        }
    }
    LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms");
}
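Each float was written through ByteBuffer.allocate(4).putFloat(...), i.e. big-endian, so the stored embedding can be decoded the same way. A sketch, with the reader and document id assumed:

Document d = reader.document(docId);
BytesRef raw = d.getBinaryValue(FIELD_BODY);
ByteBuffer buffer = ByteBuffer.wrap(raw.bytes, raw.offset, raw.length);
float[] embedding = new float[raw.length / Float.BYTES];
for (int i = 0; i < embedding.length; i++) {
    embedding[i] = buffer.getFloat();
}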
From source file: io.anserini.embeddings.search.IndexW2V.java
License: Apache License
public void indexEmbeddings() throws IOException, InterruptedException {
    LOG.info("Starting indexer...");

    final Directory dir = FSDirectory.open(indexPath);
    final WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
    final IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    config.setUseCompoundFile(false);
    config.setMergeScheduler(new ConcurrentMergeScheduler());

    final IndexWriter writer = new IndexWriter(dir, config);
    Document document = new Document();

    BufferedReader bRdr = new BufferedReader(new FileReader(args.input));
    String line = null;
    bRdr.readLine(); // skip the header line
    while ((line = bRdr.readLine()) != null) {
        String[] termEmbedding = line.trim().split("\t");
        document.add(new StringField(LuceneDocumentGenerator.FIELD_ID, termEmbedding[0], Field.Store.YES));
        // Here the embedding is kept as its raw text representation.
        document.add(new StoredField(LuceneDocumentGenerator.FIELD_BODY, termEmbedding[1]));
        // Write the document and reuse the instance for the next term;
        // without these two calls the fields would only accumulate in memory.
        writer.addDocument(document);
        document.clear();
    }
    writer.close();
}
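Since this variant stores the embedding as its raw text line, decoding is plain string parsing. A sketch, with the stored document assumed to have been retrieved already:

String[] parts = storedDoc.get(LuceneDocumentGenerator.FIELD_BODY).split(" ");
float[] vector = new float[parts.length];
for (int i = 0; i < parts.length; i++) {
    vector[i] = Float.parseFloat(parts[i]);
}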
From source file: io.anserini.index.generator.LuceneDocumentGenerator.java
License: Apache License
public Document createDocument(SourceDocument src) {
    String id = src.id();
    String contents;

    try {
        // If there's a transform, use it.
        contents = transform != null ? transform.apply(src.content()) : src.content();
    } catch (Exception e) {
        LOG.error("Error extracting document text, skipping document: " + id, e);
        counters.errors.incrementAndGet();
        return null;
    }

    if (contents.trim().length() == 0) {
        LOG.info("Empty document: " + id);
        counters.emptyDocuments.incrementAndGet();
        return null;
    }

    // make a new, empty document
    Document document = new Document();

    // document id
    document.add(new StringField(FIELD_ID, id, Field.Store.YES));

    if (args.storeRawDocs) {
        document.add(new StoredField(FIELD_RAW, src.content()));
    }

    FieldType fieldType = new FieldType();
    fieldType.setStored(args.storeTransformedDocs);

    // Are we storing document vectors?
    if (args.storeDocvectors) {
        fieldType.setStoreTermVectors(true);
        fieldType.setStoreTermVectorPositions(true);
    }

    // Are we building a "positional" or "count" index?
    if (args.storePositions) {
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    } else {
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    }
    document.add(new Field(FIELD_BODY, contents, fieldType));

    return document;
}
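At retrieval time the raw and transformed text come back through the stored-fields API. A minimal sketch, assuming a searcher over this index and reusing the field-name constants above:

Document stored = searcher.doc(scoreDoc.doc);
String raw = stored.get(FIELD_RAW);   // non-null only if storeRawDocs was set
String body = stored.get(FIELD_BODY); // non-null only if storeTransformedDocs was set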
From source file: io.anserini.index.generator.TweetGenerator.java
License: Apache License
@Override
public Document createDocument(TweetCollection.Document tweetDoc) {
    String id = tweetDoc.id();

    if (tweetDoc.content().trim().isEmpty()) {
        counters.empty.incrementAndGet();
        return null;
    }

    final TwitterTextParseResults result = TwitterTextParser.parseTweet(tweetDoc.content().trim());
    if (!result.isValid) {
        counters.errors.incrementAndGet();
        return null;
    }
    String text = tweetDoc.content().trim().substring(result.validTextRange.start, result.validTextRange.end);

    if (!args.tweetKeepUrls) {
        final Extractor extractor = new Extractor();
        final List<String> urls = extractor.extractURLs(text);
        for (String url : urls) {
            text = text.replaceAll(url, "");
        }
    }
    text = text.trim();
    if (text.isEmpty()) {
        counters.empty.incrementAndGet();
        return null;
    }

    // Skip deleted tweet ids.
    if (deletes != null && deletes.contains(id)) {
        counters.skipped.incrementAndGet();
        return null;
    }
    if (tweetDoc.getIdLong() > args.tweetMaxId) {
        counters.skipped.incrementAndGet();
        return null;
    }
    if (!args.tweetKeepRetweets && tweetDoc.getRetweetedStatusId().isPresent()) {
        counters.skipped.incrementAndGet();
        return null;
    }

    Document doc = new Document();
    doc.add(new StringField(FIELD_ID, id, Field.Store.YES));
    // We need this to break scoring ties.
    doc.add(new LongPoint(StatusField.ID_LONG.name, tweetDoc.getIdLong()));
    doc.add(new NumericDocValuesField(StatusField.ID_LONG.name, tweetDoc.getIdLong()));

    tweetDoc.getEpoch().ifPresent(epoch -> doc.add(new LongPoint(StatusField.EPOCH.name, epoch)));
    doc.add(new StringField(StatusField.SCREEN_NAME.name, tweetDoc.getScreenName(), Field.Store.NO));
    doc.add(new IntPoint(StatusField.FRIENDS_COUNT.name, tweetDoc.getFollowersCount()));
    doc.add(new IntPoint(StatusField.FOLLOWERS_COUNT.name, tweetDoc.getFriendsCount()));
    doc.add(new IntPoint(StatusField.STATUSES_COUNT.name, tweetDoc.getStatusesCount()));

    tweetDoc.getInReplyToStatusId().ifPresent(rid -> {
        doc.add(new LongPoint(StatusField.IN_REPLY_TO_STATUS_ID.name, rid));
        tweetDoc.getInReplyToUserId()
                .ifPresent(ruid -> doc.add(new LongPoint(StatusField.IN_REPLY_TO_USER_ID.name, ruid)));
    });

    tweetDoc.getRetweetedStatusId().ifPresent(rid -> {
        doc.add(new LongPoint(StatusField.RETWEETED_STATUS_ID.name, rid));
        tweetDoc.getRetweetedUserId()
                .ifPresent(ruid -> doc.add(new LongPoint(StatusField.RETWEETED_USER_ID.name, ruid)));
        tweetDoc.getRetweetCount().ifPresent(rc -> doc.add(new LongPoint(StatusField.RETWEET_COUNT.name, rc)));
    });

    tweetDoc.getLang().ifPresent(lang -> doc.add(new StringField(StatusField.LANG.name, lang, Field.Store.NO)));

    if (args.storeRawDocs) {
        // store the raw json string as one single field
        doc.add(new StoredField(FIELD_RAW, tweetDoc.getJsonString()));
    }

    FieldType fieldType = new FieldType();
    fieldType.setStored(args.storeTransformedDocs);

    // Are we storing document vectors?
    if (args.storeDocvectors) {
        fieldType.setStoreTermVectors(true);
        fieldType.setStoreTermVectorPositions(true);
    }

    // Are we building a "positional" or "count" index?
    if (args.storePositions) {
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    } else {
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    }
    doc.add(new Field(FIELD_BODY, text, fieldType));

    return doc;
}
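A sketch of how the point fields and the stored raw JSON interact at query time; the searcher and the epoch bounds are illustrative:

Query recent = LongPoint.newRangeQuery(StatusField.EPOCH.name, startEpoch, endEpoch);
TopDocs hits = searcher.search(recent, 10);
for (ScoreDoc sd : hits.scoreDocs) {
    String json = searcher.doc(sd.doc).get(FIELD_RAW); // present only if storeRawDocs was set
}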
From source file: io.datalayer.lucene.helper.AosUtil.java
License: Apache License
public static void indexNumbersMethod() {
    // Constructs stored-only fields from an int, a double, and a String;
    // to take effect they would still have to be added to a Document.
    new StoredField("size", 4096);
    new StoredField("price", 10.99);
    new StoredField("author", "Arthur C. Clark");
}
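For reference, StoredField has an overload per value type; a sketch of the full family (field names and values here are illustrative):

new StoredField("size", 4096);                        // int
new StoredField("timestamp", 1318412345000L);         // long
new StoredField("weight", 2.5f);                      // float
new StoredField("price", 10.99);                      // double
new StoredField("author", "Arthur C. Clark");         // String
new StoredField("thumbnail", new byte[] { 1, 2, 3 }); // byte[]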
From source file: io.datalayer.lucene.helper.AosUtil.java
License: Apache License
/**
 * #1 Good domain boost factor: 1.5
 *
 * #2 Bad domain boost factor: 0.1
 */
public void docBoostMethod() throws IOException {
    Directory dir = new RAMDirectory();
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_46,
            AosAnalyser.NO_LIMIT_TOKEN_COUNT_SIMPLE_ANALYSER);
    IndexWriter writer = new IndexWriter(dir, conf);

    Document doc = new Document();
    String senderEmail = getSenderEmail();
    String senderName = getSenderName();
    String subject = getSubject();
    String body = getBody();
    doc.add(new StoredField("senderEmail", senderEmail));
    doc.add(new Field("senderName", senderName, AosFieldType.INDEXED_STORED_TERMVECTORS));
    doc.add(new Field("subject", subject, AosFieldType.INDEXED_STORED_TERMVECTORS));
    doc.add(new Field("body", body, AosFieldType.INDEXED_STORED_TERMVECTORS));

    String lowerDomain = getSenderDomain().toLowerCase();
    if (isImportant(lowerDomain)) {
        // doc.setBoost(1.5F);
    } else if (isUnimportant(lowerDomain)) {
        // doc.setBoost(0.1F);
    }
    writer.addDocument(doc);
    writer.close();
}
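The setBoost calls are commented out because index-time document boosts are no longer supported in modern Lucene (document-level setBoost was dropped in 4.0, and index-time boosting as a whole was removed in 7.0). A hedged sketch of the usual replacement, folding a stored per-document factor into scoring at query time; the field name and base query are illustrative:

// At index time: record the boost factor as a doc value.
doc.add(new DoubleDocValuesField("domainBoost", isImportant(lowerDomain) ? 1.5 : 0.1));

// At query time: multiply each hit's score by the stored factor.
Query base = new TermQuery(new Term("subject", "lucene"));
Query boosted = FunctionScoreQuery.boostByValue(base, DoubleValuesSource.fromDoubleField("domainBoost"));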
From source file: io.datalayer.lucene.helper.AosUtil.java
License: Apache License
public void numberField() {
    Document doc = new Document();
    doc.add(new StoredField("price", 19.99));
}
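A stored numeric field comes back as a generic Number; a sketch of unwrapping it after a search, with the searcher and document id assumed:

Document hit = searcher.doc(docId);
double price = hit.getField("price").numericValue().doubleValue(); // 19.99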