Example usage for org.apache.lucene.document StoredField StoredField

Introduction

This page collects example usages of the org.apache.lucene.document.StoredField constructor.

Prototype

public StoredField(String name, double value) 

Document

Create a stored-only field with the given double value.
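
Before the usage listings, here is a minimal, self-contained sketch of the constructor in context. It is not taken from the examples below; the index path ("example_index") and the field names ("id", "price") are illustrative. It adds a stored-only double field at index time and reads it back from the stored document at search time.

import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class StoredFieldDoubleExample {
    public static void main(String[] args) throws Exception {
        Directory dir = FSDirectory.open(Paths.get("example_index"));
        IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        try (IndexWriter writer = new IndexWriter(dir, iwc)) {
            Document doc = new Document();
            doc.add(new StringField("id", "doc-1", Field.Store.YES)); // indexed and stored
            doc.add(new StoredField("price", 19.99));                 // stored only, not searchable
            writer.addDocument(doc);
        }
        try (IndexReader reader = DirectoryReader.open(dir)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            TopDocs hits = searcher.search(new TermQuery(new Term("id", "doc-1")), 1);
            Document hit = searcher.doc(hits.scoreDocs[0].doc);
            // A StoredField value comes back through the stored document, not through a query.
            double price = hit.getField("price").numericValue().doubleValue();
            System.out.println("price = " + price);
        }
        dir.close();
    }
}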

Usage

From source file:org.xwiki.contrib.repository.pypi.internal.searching.PypiPackageListIndexUpdateTask.java

License:Open Source License

private Document createNewDocument(String packageName, String version) {
    Document document = new Document();
    document.add(new TextField(LuceneParameters.PACKAGE_NAME, packageName, Field.Store.YES));
    document.add(new StringField(LuceneParameters.ID, packageName, Field.Store.YES));
    document.add(new StoredField(LuceneParameters.VERSION, version));
    return document;
}

From source file:part1.IndexBusiness.java

/**
 * @param args the command line arguments
 */
public static void main(String[] args) throws FileNotFoundException, IOException {
    // TODO code application logic here
    Analyzer analyzer = new StandardAnalyzer();
    Directory dir = FSDirectory.open(Paths.get("index_business"));
    IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
    iwc.setOpenMode(OpenMode.CREATE);
    IndexWriter writer = new IndexWriter(dir, iwc);

    Gson gson = new Gson();
    File f = new File("/Users/arpitkhandelwal/Downloads/yelp_dataset_challenge_academic_dataset/business.json");
    BufferedReader reader = new BufferedReader(new FileReader(f));
    String line = null;
    int count = 0;
    try {
        while ((line = reader.readLine()) != null) {
            JsonObject r = gson.fromJson(line, JsonObject.class);
            // System.out.println(r.get("attributes").toString());
            JsonObject a = gson.fromJson(r.get("attributes").toString(), JsonObject.class);
            // a.keySet();
            Set alist = a.entrySet();

            Business r1 = gson.fromJson(line, Business.class);
            Document luceneDoc = new Document();
            luceneDoc.add(new TextField("fulladdress", r1.full_address, Field.Store.YES));
            luceneDoc.add(new StringField("businessid", r1.business_id, Field.Store.YES));
            luceneDoc.add(new StringField("city", r1.city, Field.Store.YES));
            luceneDoc.add(new StringField("open", String.valueOf(r1.open), Field.Store.YES));
            luceneDoc.add(new StringField("name", r1.name, Field.Store.YES));
            luceneDoc.add(new StoredField("longitude", r1.longitude));
            luceneDoc.add(new StoredField("latitude", r1.latitude));
            luceneDoc.add(new StoredField("reviewcount", r1.review_count));
            luceneDoc.add(new StringField("state", r1.state, Field.Store.YES));
            luceneDoc.add(new StoredField("stars", r1.stars));
            luceneDoc.add(new StringField("type", r1.type, Field.Store.YES));
            for (String c : r1.categories)
                luceneDoc.add(new SortedSetDocValuesField("category", new BytesRef(c)));
            for (String c : r1.neighborhoods)
                luceneDoc.add(new SortedSetDocValuesField("neighborhoods", new BytesRef(c)));
            for (Object attribute : alist) {
                String attributedetail[] = attribute.toString().split("=");
                String key = attributedetail[0].trim();
                String value = attributedetail[1].trim();
                luceneDoc.add(new TextField(key, value, Field.Store.YES));
            }
            writer.addDocument(luceneDoc);
        }

    } catch (Exception e) {
        System.out.println(line);
        e.printStackTrace();
    } finally {
        reader.close();
        writer.close();
    }

}

From source file:perf.PrintPerFieldHeapUsage.java

License:Apache License

public static void main(String[] args) throws IOException {
    Directory dir = FSDirectory.open(Paths.get("fields"));

    int fieldUpto;
    IndexWriterConfig iwc;
    IndexWriter w;
    long t0;
    IndexReader r;

    // Stored field:
    iwc = new IndexWriterConfig(new WhitespaceAnalyzer());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    w = new IndexWriter(dir, iwc);

    fieldUpto = 0;
    t0 = System.nanoTime();
    for (int i = 0; i < FIELD_COUNT; i++) {
        Document doc = new Document();
        doc.add(new StoredField("f" + fieldUpto, "text" + i));
        fieldUpto++;
        w.addDocument(doc);
    }

    w.forceMerge(1);
    w.close();

    r = DirectoryReader.open(dir);
    System.out.println(String.format(Locale.ROOT, "Took %.1f sec; bytes per unique StoredField: %.1f",
            (System.nanoTime() - t0) / 1000000000.0, (RamUsageTester.sizeOf(r) / (double) FIELD_COUNT)));
    r.close();

    // Indexed StringField:
    iwc = new IndexWriterConfig(new WhitespaceAnalyzer());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    w = new IndexWriter(dir, iwc);

    fieldUpto = 0;
    t0 = System.nanoTime();
    for (int i = 0; i < FIELD_COUNT; i++) {
        Document doc = new Document();
        doc.add(new StringField("f" + fieldUpto, "text" + i, Field.Store.NO));
        fieldUpto++;
        w.addDocument(doc);
    }

    w.forceMerge(1);
    w.close();

    r = DirectoryReader.open(dir);
    System.out.println(String.format(Locale.ROOT, "Took %.1f sec; bytes per unique StringField: %.1f",
            (System.nanoTime() - t0) / 1000000000.0, (RamUsageTester.sizeOf(r) / (double) FIELD_COUNT)));
    r.close();

    // Numeric DV field:
    iwc = new IndexWriterConfig(new WhitespaceAnalyzer());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    w = new IndexWriter(dir, iwc);

    fieldUpto = 0;
    t0 = System.nanoTime();
    for (int i = 0; i < FIELD_COUNT; i++) {
        Document doc = new Document();
        doc.add(new NumericDocValuesField("f" + fieldUpto, i));
        fieldUpto++;
        w.addDocument(doc);
    }

    w.forceMerge(1);
    w.close();

    r = DirectoryReader.open(dir);
    System.out.println(String.format(Locale.ROOT,
            "Took %.1f sec; bytes per unique NumericDocValuesField, latent: %.1f",
            (System.nanoTime() - t0) / 1000000000.0, (RamUsageTester.sizeOf(r) / (double) FIELD_COUNT)));
    // Now force lazy loading of all the DV fields:
    for (int i = 0; i < FIELD_COUNT; i++) {
        MultiDocValues.getNumericValues(r, "f" + i);
    }
    System.out.println(String.format(Locale.ROOT, "Bytes per unique NumericDocValuesField, loaded: %.1f",
            (RamUsageTester.sizeOf(r) / (double) FIELD_COUNT)));
    r.close();

    // Sorted DV field:
    iwc = new IndexWriterConfig(new WhitespaceAnalyzer());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    w = new IndexWriter(dir, iwc);

    fieldUpto = 0;
    t0 = System.nanoTime();
    for (int i = 0; i < FIELD_COUNT; i++) {
        Document doc = new Document();
        doc.add(new SortedDocValuesField("f" + fieldUpto, new BytesRef("text" + i)));
        fieldUpto++;
        w.addDocument(doc);
    }

    w.forceMerge(1);
    w.close();

    r = DirectoryReader.open(dir);
    System.out.println(String.format(Locale.ROOT,
            "Took %.1f sec; bytes per unique SortedDocValuesField, latent: %.1f",
            (System.nanoTime() - t0) / 1000000000.0, (RamUsageTester.sizeOf(r) / (double) FIELD_COUNT)));
    // Now force lazy loading of all the DV fields:
    for (int i = 0; i < FIELD_COUNT; i++) {
        MultiDocValues.getSortedValues(r, "f" + i);
    }
    System.out.println(String.format(Locale.ROOT, "Bytes per unique SortedDocValuesField, loaded: %.1f",
            (RamUsageTester.sizeOf(r) / (double) FIELD_COUNT)));
    r.close();

    dir.close();
}

From source file:perLucene.IndexerThread.java

License:Open Source License

public boolean addDoc(String language, String summary, String text, long date, String wkt, byte[] gdid,
        long id) {
    Analyzer analyzer = ha.get(language);

    Document doc = new Document();
    doc.add(new TextField("summary", summary, Field.Store.NO));
    doc.add(new TextField("text", text, Field.Store.NO));
    doc.add(new LongDocValuesField("uid", id));
    doc.add(new DerefBytesDocValuesField("language", new BytesRef(language)));
    doc.add(new LongField("date", date, Field.Store.NO));
    doc.add(new StoredField("gdid", gdid));
    doc.add(new LongField("id", id, Field.Store.NO));
    // ?? for deletions we need LongField to do search on id
    if (sp.addFields(doc, wkt)) {
        try {
            w.addDocument(doc, analyzer);
        } catch (Exception e) {
            System.out.println("Couldnt add Doc");
            System.out.println("Stacktrace " + e.toString());

            try {
                if (w != null) {
                    w.close();
                }
            } catch (IOException e1) {
            }

            System.exit(-1);
        }

        return true;
    } else {
        return false;
    }
}

From source file:slib.sml.sm.core.measures.corpus.VocContextMatrixBuilder.java

License:Open Source License

public static void main(String[] args) throws SLIB_Ex_Critic, IOException {

    String[] ext = { "txt" };
    List<File> files = FileUtils.listFilesForFolder("/data/tmp/wiki/", Arrays.asList(ext), 100000);

    List<String> docField = new ArrayList<String>();
    docField.add("content");

    String[] vocArray = { "lion", "panthera", "Africa", "lamb", "insecticides", "animal", "Genealogists",
            "rugby", "football", "Sydney", "Australia" };
    Set<String> voc = new HashSet<String>(Arrays.asList(vocArray));

    MatrixType matrixType = MatrixType.WORD_WORD;
    VocContextMatrixBuilder matrixBuilder = new VocContextMatrixBuilder(matrixType, voc, docField);

    for (File f : files) {
        Document doc = new Document();
        String fileAsString = FileUtils.readFile(f.getAbsolutePath(), Charset.defaultCharset());
        doc.add(new StoredField("content", fileAsString));
        matrixBuilder.process(doc);
    }

    //        Document docA = new Document();
    //        docA.add(new StoredField("title", "Lion article"));
    //        docA.add(new StoredField("content", "The lion (Panthera leo) is one of the four big cats in the genus Panthera and a member of the family Felidae. With some males exceeding 250 kg (550 lb) in weight,[4] it is the second-largest living cat after the tiger. Wild lions currently exist in sub-Saharan Africa and in Asia (where an endangered remnant population resides in Gir Forest National Park in India) while other types of lions have disappeared from North Africa and Southwest Asia in historic times. Until the late Pleistocene, about 10,000 years ago, the lion was the most widespread large land mammal after humans"));
    //
    //        Document docB = new Document();
    //        docB.add(new StoredField("title", "Java (programming language)"));
    //        docB.add(new StoredField("content", "Java is a computer programming language that is concurrent, class-based, object-oriented, and specifically designed to have as few implementation dependencies as possible. It is intended to let application developers \"write once, run anywhere\" (WORA), meaning that code that runs on one platform does not need to be recompiled to run on another. Java applications are typically compiled to bytecode (class file) that can run on any Java virtual machine (JVM) regardless of computer architecture. Java is, as of 2014, one of the most popular programming languages in use, particularly for client-server web applications, with a reported 9 million developers.[10][11] Java was originally developed by James Gosling at Sun Microsystems (which has since merged into Oracle Corporation) and released in 1995 as a core component of Sun Microsystems' Java platform. The language derives much of its syntax from C and C++, but it has fewer low-level facilities than either of them."));
    //
    //        Document docC = new Document();
    //        docC.add(new StoredField("title", "Java (programming language)"));
    //        docC.add(new StoredField("content", "Java is a computer programming language that is concurrent, class-based, object-oriented, and specifically designed to have as few implementation dependencies as possible. It is intended to let application developers \"write once, run anywhere\" (WORA), meaning that code that runs on one platform does not need to be recompiled to run on another. Java applications are typically compiled to bytecode (class file) that can run on any Java virtual machine (JVM) regardless of computer architecture. Java is, as of 2014, one of the most popular programming languages in use, particularly for client-server web applications, with a reported 9 million developers.[10][11] Java was originally developed by James Gosling at Sun Microsystems (which has since merged into Oracle Corporation) and released in 1995 as a core component of Sun Microsystems' Java platform. The language derives much of its syntax from C and C++, but it has fewer low-level facilities than either of them."));
    //
    //        docs.add(docA);
    //        docs.add(docB);
    //        docs.add(docC);
    //        matrixBuilder.buildMatrix(docs);

    Matrix<String, String> mat = matrixBuilder.getMatrix();

    System.out.println("size: " + mat.getInternalStorage().keySet().size());

    for (String s : mat.getInternalStorage().keySet()) {
        System.out.println(
                s + "\t(" + mat.getInternalStorage().get(s).size() + ")\t" + mat.getInternalStorage().get(s));
    }
}

From source file:spatialluceneindexer.files.LuceneWriter.java

public void addPark(Park park) {
    Document doc = new Document();

    doc.add(new TextField("name", park.getname(), Field.Store.YES));

    //First we make the shape, then we make the indexed field from it. This field cannot be stored.
    //This assumes there is always only one shape per document, although there could be multiple.
    Shape pointShape = spatialContext.makePoint(park.getPos().get(0).doubleValue(),
            park.getPos().get(1).doubleValue());
    for (IndexableField f : spatialStrategy.createIndexableFields(pointShape)) {
        doc.add(f);
    }

    //now let's store the field as well - could be useful to return this to the client
    doc.add(new StoredField("coords", spatialContext.toString(pointShape)));

    try {
        indexWriter.addDocument(doc);
    } catch (IOException ex) {
        System.out.println(
                "Threw an exception trying to add the doc: " + ex.getClass() + " :: " + ex.getMessage());
    }
    System.out.println(park.getname());

}

From source file:start.lucene.CPeptidesIndexer.java

/**
 * This method prepares a Document from a given line for indexing
 *
 * @param line
 * @return
 * @throws IOException
 */
public Document getDocument(StringBuilder line) throws IOException {
    Document doc = new Document();
    int maxDoc = getIndexWriter().maxDoc(), id = maxDoc++;
    // Fill document like "name-value" pair.
    doc.add(new IntField(FieldName.ID, id, Field.Store.YES)); // running document id
    String[] sp = line.toString().split("\t");
    // Except for mass, every field below is stored-only (StoredField) rather than indexed text, because:
    // - A TextField is a tokenized sequence of terms; punctuation and spacing are ignored - good for keyword search.
    // - A StringField is a single literal term (not tokenized; punctuation, spacing and case preserved), for atomic values,
    //   and it is always indexed (since Lucene 4.0).
    // - A StoredField is stored but not indexed at all, and so is not searchable.
    //   StoredField(String name, String value) creates a stored-only field with the given string value.
    doc.add(new StoredField(FieldName.PROTEINA, sp[0])); // proteinA name
    doc.add(new StoredField(FieldName.PROTEINB, sp[1])); // proteinB name
    doc.add(new StoredField(FieldName.PEPTIDEA, sp[2])); // peptideA sequence
    doc.add(new StoredField(FieldName.PEPTIDEB, sp[3])); // peptideB sequence
    doc.add(new StoredField(FieldName.LINKA, sp[4])); // link position on peptideA
    doc.add(new StoredField(FieldName.LINKB, sp[5])); // link position on peptideB
    doc.add(new StoredField(FieldName.FIXMODA, sp[6])); // fixed modifications of peptideA
    doc.add(new StoredField(FieldName.FIXMODB, sp[7])); // fixed modifications of peptideB
    doc.add(new StoredField(FieldName.VARMODA, sp[8])); // variable modifications of peptideA
    doc.add(new StoredField(FieldName.VARMODB, sp[9])); // variable modifications of peptideB
    //doc.add(new StringField("mass", sp[10], Field.Store.YES)); // Mass    
    doc.add(new DoubleField(FieldName.MASS, Double.parseDouble(sp[10]), indexedFieldType));
    if (sp.length > 11) {
        doc.add(new StoredField(FieldName.TYPE, sp[11])); //Type
        doc.add(new StoredField(FieldName.LABEL, sp[12])); // Labeling-true:Heavylabeled
    }
    return doc;
}
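
Since everything except MASS is added above as a StoredField, those columns can be retrieved but not queried. The snippet below is a hedged sketch (not part of the original source) of how such rows might be looked up at search time: the query goes through the indexed MASS field, using the same Lucene 4.x/5.x numeric API that IntField/DoubleField belong to, and the stored-only columns are then read off each hit. The searcher, targetMass and tolerance variables are assumed to exist; the FieldName constants are the ones used above.

    Query query = NumericRangeQuery.newDoubleRange(FieldName.MASS,
            targetMass - tolerance, targetMass + tolerance, true, true);
    TopDocs hits = searcher.search(query, 50);
    for (ScoreDoc sd : hits.scoreDocs) {
        Document hit = searcher.doc(sd.doc);
        // Stored-only values are read straight off the retrieved document.
        System.out.println(hit.get(FieldName.PEPTIDEA) + " x " + hit.get(FieldName.PEPTIDEB)
                + " (" + hit.get(FieldName.PROTEINA) + " / " + hit.get(FieldName.PROTEINB) + ")");
    }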

From source file:uk.ac.open.kmi.squire.index.RDFDatasetIndexer.java

public Document indexSignature(String urlAddress, String graphName, IRDFDataset indexand,
        Collection<String> propertySet, boolean overwrite) {

    if (alreadyIndexed(urlAddress, graphName) && !overwrite) {
        log.warn("Already indexed: {}{}", urlAddress, graphName == null ? "" : "::" + graphName);
        log.warn(" ... overwrite not set, so not indexing.");
        return null;
    }
    Analyzer analyzer = new StandardAnalyzer(); // = new WhitespaceAnalyzer();
    IndexWriter indexWriter;
    /*
     * With IndexWriterConfig.OpenMode.CREATE_OR_APPEND, IndexWriter will create a
     * new index if there is not already one at the provided path, and otherwise
     * open the existing index.
     */
    IndexWriterConfig config = new IndexWriterConfig(analyzer);// .setOpenMode(OpenMode.CREATE_OR_APPEND);
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    try {
        indexWriter = new IndexWriter(getIndex(), config);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    // For every dataset a document
    Document doc = new Document();

    // XXX AA I think the values are so because it is assumed that Set#toString()
    // prints [ one, two, ... ] but can it be trusted?
    doc.add(new Field("URL", urlAddress, Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(new Field("GraphName", graphName, Field.Store.YES, Field.Index.NOT_ANALYZED));
    // doc.add(new Field("ClassSet", indexand.getClassSet().toString(),
    // Field.Store.YES, Field.Index.NO));
    doc.add(new Field("ObjectPropertySet", indexand.getObjectPropertySet().toString(), Field.Store.YES,
            Field.Index.NO));
    doc.add(new Field("DatatypePropertySet", indexand.getDatatypePropertySet().toString(), Field.Store.YES,
            Field.Index.NO));
    doc.add(new Field("LiteralSet", indexand.getLiteralSet().toString(), Field.Store.YES, Field.Index.NO));
    doc.add(new Field("IndividualSet", indexand.getIndividualSet().toString(), Field.Store.YES,
            Field.Index.NO));
    doc.add(new Field("RDFVocabulary", indexand.getRDFVocabulary().toString(), Field.Store.YES,
            Field.Index.NO));
    if (propertySet != null && !propertySet.isEmpty())
        doc.add(new Field("PropertySet", propertySet.toString(), Field.Store.YES, Field.Index.NO));

    // TODO the new way of using Lucene, apply to rest

    doc.add(new StoredField(Fieldd.ClassSet.toString(), indexand.getClassSet().toString()));
    JsonObject jSign = new JsonObject();
    for (Entry<String, ClassSignature> entry : indexand.getClassSignatures().entrySet())
        jSign.put(entry.getKey(), entry.getValue().jsonifyPaths());

    try {
        ByteArrayOutputStream os = new ByteArrayOutputStream();
        JSON.write(os, jSign);
        doc.add(new StoredField(Fieldd.CLASS_SIGNATURES.toString(), new String(os.toByteArray(), "UTF-8")));
    } catch (UnsupportedEncodingException e1) {
        shutdown(indexWriter);
        throw new RuntimeException("UTF-8 not supported. Seriously?", e1);
    }

    // Remove the old one(s) if any
    Builder queryBuilder = new Builder();
    queryBuilder.add(new TermQuery(new Term("URL", urlAddress)), BooleanClause.Occur.MUST);
    if (graphName != null && !graphName.isEmpty())
        queryBuilder.add(new TermQuery(new Term("GraphName", graphName)), BooleanClause.Occur.MUST);
    try {
        indexWriter.deleteDocuments(queryBuilder.build());
        indexWriter.addDocument(doc);
    } catch (IOException e) {
        throw new RuntimeException(e);
    } finally {
        shutdown(indexWriter);
    }
    return doc;
}
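
The TODO above notes that only ClassSet and CLASS_SIGNATURES use the newer field classes, while the other fields still go through the pre-4.0 Field(name, value, Store, Index) constructor. As a hedged sketch of that migration (not part of the original source): Field.Index.NOT_ANALYZED with Field.Store.YES roughly corresponds to a StringField, and Field.Index.NO with Field.Store.YES corresponds to a StoredField, i.e. stored only and not indexed.

    doc.add(new StringField("URL", urlAddress, Field.Store.YES));
    if (graphName != null && !graphName.isEmpty())
        doc.add(new StringField("GraphName", graphName, Field.Store.YES));
    doc.add(new StoredField("ObjectPropertySet", indexand.getObjectPropertySet().toString()));
    doc.add(new StoredField("DatatypePropertySet", indexand.getDatatypePropertySet().toString()));
    doc.add(new StoredField("LiteralSet", indexand.getLiteralSet().toString()));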

From source file:webdocs.ClueWebDoc.java

@Override
Document constructLuceneDoc() {
    Document doc = new Document();
    doc.add(new Field(FIELD_ID, this.docNo, Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(new Field(WTDOC_FIELD_URL, this.url, Field.Store.YES, Field.Index.NOT_ANALYZED));

    // store the title and the raw html
    doc.add(new Field(WTDOC_FIELD_TITLE, this.title == null ? "" : this.title, Field.Store.YES,
            Field.Index.ANALYZED, Field.TermVector.NO));

    String ppHTML = html;
    try {
        ppHTML = preProcessHTML(html);
    } catch (Exception ex) {
        ex.printStackTrace();
    }
    doc.add(new StoredField(WTDOC_FIELD_HTML, compress(ppHTML)));

    // the words only... no term vectors 
    doc.add(new Field(FIELD_ANALYZED_CONTENT, this.text, Field.Store.NO, Field.Index.ANALYZED,
            Field.TermVector.NO));

    return doc;
}

From source file:webdocs.WebDocAnalyzer.java

Document constructLuceneDoc() {
    Document doc = new Document();
    doc.add(new Field(FIELD_ID, this.docNo, Field.Store.YES, Field.Index.NOT_ANALYZED));

    // store the title and the raw html
    doc.add(new Field(WTDOC_FIELD_TITLE, this.title == null ? "" : this.title, Field.Store.YES,
            Field.Index.ANALYZED, Field.TermVector.NO));
    doc.add(new StoredField(WTDOC_FIELD_HTML, compress(html)));

    // the words (term vectors are not stored)
    doc.add(new Field(FIELD_ANALYZED_CONTENT, this.text, Field.Store.NO, Field.Index.ANALYZED,
            Field.TermVector.NO));

    return doc;
}