Example usage for org.apache.lucene.document StoredField StoredField

Introduction

On this page you can find example usages of the org.apache.lucene.document StoredField constructor.

Prototype

public StoredField(String name, double value) 

Document

Create a stored-only field with the given double value.
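
A minimal, self-contained sketch of this double overload and how the value is read back. The class name, field names, and the RAMDirectory choice are illustrative, written against the Lucene 5/6-era API used in the examples below:

import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class StoredFieldDoubleExample {
    public static void main(String[] args) throws Exception {
        Directory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new WhitespaceAnalyzer()));

        Document doc = new Document();
        doc.add(new StringField("id", "item-1", Field.Store.YES)); // indexed key for lookup
        doc.add(new StoredField("price", 10.99));                  // stored-only double value
        writer.addDocument(doc);
        writer.close();

        // Stored-only fields are read back per document; numericValue()
        // returns the Double that was stored.
        DirectoryReader reader = DirectoryReader.open(dir);
        double price = reader.document(0).getField("price").numericValue().doubleValue();
        System.out.println("price = " + price); // price = 10.99
        reader.close();
    }
}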

Usage

From source file:index.IndexOmimtsv.java

License:Apache License

/** Indexes the rows of a single TSV file, one document per row */
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    try (InputStream stream = Files.newInputStream(file)) {
        // make a new, empty document
        Document doc = new Document();

        // Add the path of the file as a field named "path".  Use a
        // field that is indexed (i.e. searchable), but don't tokenize 
        // the field into separate words and don't index term frequency
        // or positional information:
        Field pathField = new StringField("path", file.toString(), Field.Store.YES);
        doc.add(pathField);

        // Add the last modified date of the file as a field named "modified".
        // Use a LongPoint that is indexed (i.e. efficiently filterable with
        // PointRangeQuery).  This indexes to milli-second resolution, which
        // is often too fine.  You could instead create a number based on
        // year/month/day/hour/minutes/seconds, down to the resolution you require.
        // For example the long value 2011021714 would mean
        // February 17, 2011, 2-3 PM.
        doc.add(new LongPoint("modified", lastModified));

        // Parse the TSV stream line by line; each data row becomes its own
        // document below.  Read the stream as UTF-8 explicitly; otherwise the
        // platform default charset is used and searches for special characters
        // may fail.
        InputStreamReader ipsr = new InputStreamReader(stream, java.nio.charset.StandardCharsets.UTF_8);
        BufferedReader br = new BufferedReader(ipsr);
        String line = br.readLine(); // skip the header line
        int cpt = 0;
        while ((line = br.readLine()) != null) {
            String[] tokens = line.trim().split("\t");
            if (tokens.length > 6) {
                // Use the last "/"-separated segment of the first column as the identifier.
                String id = tokens[0].split("/")[tokens[0].split("/").length - 1].trim();
                if (id.matches("[0-9]+")) { // digits only; matches() anchors the pattern
                    doc = new Document();
                    cpt++;
                    doc.add(new TextField("ID", id, Field.Store.NO));
                    if (!tokens[5].trim().matches("^C[0-9].*")) {
                        for (String token : tokens) {
                            if (token.trim().matches("^C[0-9].*")) {
                                doc.add(new StoredField("CUI", token.trim()));
                                break;
                            }
                        }
                        if (doc.getFields().size() != 2) // no CUI found among the columns
                            doc.add(new StoredField("CUI", ""));
                    } else
                        doc.add(new StoredField("CUI", tokens[5].trim()));
                    doc.add(new StoredField("Label", tokens[1].trim()));
                    writer.addDocument(doc);
                }
            }
        }
        System.out.println("Nombre d'lments : " + cpt);
        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // New index, so we just add the document (no old document can be there):
            System.out.println("adding " + file);
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been indexed) so 
            // we use updateDocument instead to replace the old one matching the exact 
            // path, if present:
            System.out.println("updating " + file);
            writer.updateDocument(new Term("path", file.toString()), doc);
        }
    }
}

From source file:indexer.DocVector.java

public Document constructDoc() {
    Document doc = new Document();

    doc.add(new Field(FIELD_ID, docName == null ? "" : docName, Field.Store.YES, Field.Index.NOT_ANALYZED));

    // Store the vector as a byte array (binary format takes less space
    // than a text representation).
    doc.add(new StoredField(FIELD_VEC, this.getVecBytes(x)));

    //doc.add(new Field(FIELD_CELL_ID, quantized, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
    doc.add(new Field(FIELD_CELL_ID, quantized, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.NO));

    return doc;
}
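
Reading the vector back reverses this: the stored bytes come out as an org.apache.lucene.util.BytesRef. A minimal sketch, assuming an open IndexReader reader, a document id docId, and the same FIELD_VEC constant:

Document stored = reader.document(docId);
BytesRef vec = stored.getField(FIELD_VEC).binaryValue(); // bytes stored above
byte[] raw = new byte[vec.length];
System.arraycopy(vec.bytes, vec.offset, raw, 0, vec.length); // copy out the slice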

From source file:indexer.DocVector.java

public Document constructDoc(DocVector wholeDocvec) {
    Document doc = new Document();

    doc.add(new Field(FIELD_SUBVEC_ID, docName == null ? "" : docName, Field.Store.YES,
            Field.Index.NOT_ANALYZED));
    doc.add(new Field(FIELD_ID, wholeDocvec.docName == null ? "" : wholeDocvec.docName, Field.Store.YES,
            Field.Index.NOT_ANALYZED));

    // Store the vectors as byte arrays (binary format takes less space
    // than a text representation).
    doc.add(new StoredField(FIELD_PARENT_VEC, wholeDocvec.getVecBytes(x)));
    doc.add(new StoredField(FIELD_VEC, this.getVecBytes(x)));

    //doc.add(new Field(FIELD_CELL_ID, quantized, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
    doc.add(new Field(FIELD_CELL_ID, quantized, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.NO));

    return doc;
}

From source file:io.anserini.embeddings.IndexW2V.java

License:Apache License

public void indexEmbeddings() throws IOException, InterruptedException {
    LOG.info("Starting indexer...");
    long startTime = System.currentTimeMillis();
    final WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
    final IndexWriterConfig config = new IndexWriterConfig(analyzer);
    final IndexWriter writer = new IndexWriter(directory, config);

    BufferedReader bRdr = new BufferedReader(new FileReader(args.input));
    String line = null;
    bRdr.readLine(); // skip the header line

    Document document = new Document();
    ByteArrayOutputStream byteStream = new ByteArrayOutputStream();
    int cnt = 0;

    while ((line = bRdr.readLine()) != null) {
        String[] termEmbedding = line.trim().split("\t");
        document.add(new StringField(LuceneDocumentGenerator.FIELD_ID, termEmbedding[0], Field.Store.NO));
        String[] parts = termEmbedding[1].split(" ");

        for (int i = 0; i < parts.length; ++i) {
            byteStream.write(ByteBuffer.allocate(4).putFloat(Float.parseFloat(parts[i])).array());
        }
        document.add(new StoredField(FIELD_BODY, byteStream.toByteArray()));

        byteStream.flush();
        byteStream.reset();
        writer.addDocument(document);
        document.clear();
        cnt++;

        if (cnt % 100000 == 0) {
            LOG.info(cnt + " terms indexed");
        }
    }

    LOG.info(String.format("Total of %s terms added", cnt));

    try {
        writer.commit();
        writer.forceMerge(1);
    } finally {
        try {
            writer.close();
        } catch (IOException e) {
            LOG.error(e);
        }
    }

    LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms");
}
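
Decoding an embedding reverses the ByteBuffer packing above. A sketch, assuming an open IndexReader reader, a document id docId, and the same FIELD_BODY constant:

BytesRef ref = reader.document(docId).getField(FIELD_BODY).binaryValue();
ByteBuffer buffer = ByteBuffer.wrap(ref.bytes, ref.offset, ref.length);
float[] vector = new float[ref.length / Float.BYTES];
for (int i = 0; i < vector.length; i++) {
    vector[i] = buffer.getFloat(); // same big-endian order the writer used
}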

From source file:io.anserini.embeddings.search.IndexW2V.java

License:Apache License

public void indexEmbeddings() throws IOException, InterruptedException {
    LOG.info("Starting indexer...");

    final Directory dir = FSDirectory.open(indexPath);
    final WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
    final IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    config.setUseCompoundFile(false);
    config.setMergeScheduler(new ConcurrentMergeScheduler());

    final IndexWriter writer = new IndexWriter(dir, config);
    Document document = new Document();
    BufferedReader bRdr = new BufferedReader(new FileReader(args.input));
    String line = null;
    bRdr.readLine();
    while ((line = bRdr.readLine()) != null) {
        String[] termEmbedding = line.trim().split("\t");
        document.add(new StringField(LuceneDocumentGenerator.FIELD_ID, termEmbedding[0], Field.Store.YES));
        document.add(new StoredField(LuceneDocumentGenerator.FIELD_BODY, termEmbedding[1]));
        writer.addDocument(document); // write this term's document
        document = new Document();    // start a fresh document for the next term
    }
    writer.close();
}

From source file:io.anserini.index.generator.LuceneDocumentGenerator.java

License:Apache License

public Document createDocument(SourceDocument src) {
    String id = src.id();
    String contents;

    try {
        // If there's a transform, use it.
        contents = transform != null ? transform.apply(src.content()) : src.content();
    } catch (Exception e) {
        LOG.error("Error extracting document text, skipping document: " + id, e);
        counters.errors.incrementAndGet();
        return null;
    }

    if (contents.trim().length() == 0) {
        LOG.info("Empty document: " + id);
        counters.emptyDocuments.incrementAndGet();
        return null;
    }

    // make a new, empty document
    Document document = new Document();

    // document id
    document.add(new StringField(FIELD_ID, id, Field.Store.YES));

    if (args.storeRawDocs) {
        document.add(new StoredField(FIELD_RAW, src.content()));
    }

    FieldType fieldType = new FieldType();

    fieldType.setStored(args.storeTransformedDocs);

    // Are we storing document vectors?
    if (args.storeDocvectors) {
        fieldType.setStoreTermVectors(true);
        fieldType.setStoreTermVectorPositions(true);
    }

    // Are we building a "positional" or "count" index?
    if (args.storePositions) {
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    } else {
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    }

    document.add(new Field(FIELD_BODY, contents, fieldType));

    return document;
}
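
FIELD_RAW is stored-only, so it cannot be queried directly; it is read back from documents matched through the indexed FIELD_BODY. A hedged sketch, assuming an IndexSearcher searcher over this index and the generator's field constants (the query term is illustrative):

TopDocs hits = searcher.search(new TermQuery(new Term(FIELD_BODY, "lucene")), 10);
for (ScoreDoc hit : hits.scoreDocs) {
    String raw = searcher.doc(hit.doc).get(FIELD_RAW); // the raw document stored above
}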

From source file:io.anserini.index.generator.TweetGenerator.java

License:Apache License

@Override
public Document createDocument(TweetCollection.Document tweetDoc) {
    String id = tweetDoc.id();

    if (tweetDoc.content().trim().isEmpty()) {
        counters.empty.incrementAndGet();
        return null;
    }
    final TwitterTextParseResults result = TwitterTextParser.parseTweet(tweetDoc.content().trim());
    if (!result.isValid) {
        counters.errors.incrementAndGet();
        return null;
    }
    String text = tweetDoc.content().trim().substring(result.validTextRange.start, result.validTextRange.end);

    if (!args.tweetKeepUrls) {
        final Extractor extractor = new Extractor();
        final List<String> urls = extractor.extractURLs(text);
        for (String url : urls) {
            text = text.replace(url, ""); // literal replace; URLs contain regex metacharacters
        }
    }
    text = text.trim();
    if (text.isEmpty()) {
        counters.empty.incrementAndGet();
        return null;
    }

    // Skip deleted tweet ids.
    if (deletes != null && deletes.contains(id)) {
        counters.skipped.incrementAndGet();
        return null;
    }

    if (tweetDoc.getIdLong() > args.tweetMaxId) {
        counters.skipped.incrementAndGet();
        return null;
    }

    if (!args.tweetKeepRetweets && tweetDoc.getRetweetedStatusId().isPresent()) {
        counters.skipped.incrementAndGet();
        return null;
    }

    Document doc = new Document();
    doc.add(new StringField(FIELD_ID, id, Field.Store.YES));

    // We need this to break scoring ties.
    doc.add(new LongPoint(StatusField.ID_LONG.name, tweetDoc.getIdLong()));
    doc.add(new NumericDocValuesField(StatusField.ID_LONG.name, tweetDoc.getIdLong()));

    tweetDoc.getEpoch().ifPresent(epoch -> doc.add(new LongPoint(StatusField.EPOCH.name, epoch)));
    doc.add(new StringField(StatusField.SCREEN_NAME.name, tweetDoc.getScreenName(), Field.Store.NO));
    doc.add(new IntPoint(StatusField.FRIENDS_COUNT.name, tweetDoc.getFriendsCount()));
    doc.add(new IntPoint(StatusField.FOLLOWERS_COUNT.name, tweetDoc.getFollowersCount()));
    doc.add(new IntPoint(StatusField.STATUSES_COUNT.name, tweetDoc.getStatusesCount()));

    tweetDoc.getInReplyToStatusId().ifPresent(rid -> {
        doc.add(new LongPoint(StatusField.IN_REPLY_TO_STATUS_ID.name, rid));
        tweetDoc.getInReplyToUserId()
                .ifPresent(ruid -> doc.add(new LongPoint(StatusField.IN_REPLY_TO_USER_ID.name, ruid)));
    });

    tweetDoc.getRetweetedStatusId().ifPresent(rid -> {
        doc.add(new LongPoint(StatusField.RETWEETED_STATUS_ID.name, rid));
        tweetDoc.getRetweetedUserId()
                .ifPresent(ruid -> doc.add(new LongPoint(StatusField.RETWEETED_USER_ID.name, ruid)));
        tweetDoc.getRetweetCount().ifPresent(rc -> doc.add(new LongPoint(StatusField.RETWEET_COUNT.name, rc)));
    });

    tweetDoc.getLang().ifPresent(lang -> doc.add(new StringField(StatusField.LANG.name, lang, Field.Store.NO)));

    if (args.storeRawDocs) { // store the raw json string as one single field
        doc.add(new StoredField(FIELD_RAW, tweetDoc.getJsonString()));
    }

    FieldType fieldType = new FieldType();

    fieldType.setStored(args.storeTransformedDocs);

    // Are we storing document vectors?
    if (args.storeDocvectors) {
        fieldType.setStoreTermVectors(true);
        fieldType.setStoreTermVectorPositions(true);
    }

    // Are we building a "positional" or "count" index?
    if (args.storePositions) {
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    } else {
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    }

    doc.add(new Field(FIELD_BODY, text, fieldType));

    return doc;
}

From source file:io.datalayer.lucene.helper.AosUtil.java

License:Apache License

public static void indexNumbersMethod() {
    // StoredField has overloads for each supported value type; these
    // fields would normally be added to a Document before indexing.
    new StoredField("size", 4096); // int overload
    new StoredField("price", 10.99); // double overload
    new StoredField("author", "Arthur C. Clark"); // String overload
}

From source file:io.datalayer.lucene.helper.AosUtil.java

License:Apache License

/**
 * #1 Good domain boost factor: 1.5
 * 
 * #2 Bad domain boost factor: 0.1
 */
public void docBoostMethod() throws IOException {

    Directory dir = new RAMDirectory();
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_46,
            AosAnalyser.NO_LIMIT_TOKEN_COUNT_SIMPLE_ANALYSER);
    IndexWriter writer = new IndexWriter(dir, conf);

    Document doc = new Document();
    String senderEmail = getSenderEmail();
    String senderName = getSenderName();
    String subject = getSubject();
    String body = getBody();
    doc.add(new StoredField("senderEmail", senderEmail));
    doc.add(new Field("senderName", senderName, AosFieldType.INDEXED_STORED_TERMVECTORS));
    doc.add(new Field("subject", subject, AosFieldType.INDEXED_STORED_TERMVECTORS));
    doc.add(new Field("body", body, AosFieldType.INDEXED_STORED_TERMVECTORS));
    String lowerDomain = getSenderDomain().toLowerCase();
    if (isImportant(lowerDomain)) {
        // doc.setBoost(1.5F);
    } else if (isUnimportant(lowerDomain)) {
        // doc.setBoost(0.1F);
    }

    writer.addDocument(doc);

    writer.close();

}

From source file:io.datalayer.lucene.helper.AosUtil.java

License:Apache License

public void numberField() {
    Document doc = new Document();
    doc.add(new StoredField("price", 19.99));
}
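
As the AosUtil.java snippets show, StoredField values are retrievable but never searchable. A common pattern, sketched below with an illustrative field name, pairs the stored value with an indexed point field of the same name, so the number can be range-queried via DoublePoint.newRangeQuery and still read back from search hits:

Document doc = new Document();
doc.add(new DoublePoint("price", 19.99)); // indexed: enables range/exact queries
doc.add(new StoredField("price", 19.99)); // stored: retrievable from search hits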