List of usage examples for the org.apache.lucene.document.StoredField constructor
public StoredField(String name, double value)
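None of the snippets below uses this double-valued overload directly; they pass String or byte[]/BytesRef values instead. A minimal sketch of the overload above, with an illustrative field name and value (not taken from the examples):

import org.apache.lucene.document.Document;
import org.apache.lucene.document.StoredField;

// Minimal sketch: store a double alongside a document. A StoredField is
// stored for retrieval only; it is not indexed and not searchable by itself.
Document doc = new Document();
doc.add(new StoredField("price", 9.99));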
From source file:luceneindexer.files.LuceneWriter.java
public void addPark(Park park) {
    Document doc = new Document();
    doc.add(new TextField("name", park.getname(), Field.Store.YES));
    doc.add(new StoredField("coordinates", park.getPos().toString()));
    try {
        indexWriter.addDocument(doc);
    } catch (IOException ex) {
        System.out.println("Threw an exception trying to add the doc: " + ex.getClass() + " :: " + ex.getMessage());
    }
    System.out.println(park.getname());
}
From source file:mw.wikidump.MakeLuceneIndex.java
License:Open Source License
/**
 * @param args
 * @throws IOException
 * @throws ParseException
 */
public static void main(String[] args) throws IOException, ParseException {
    String baseDir = "";
    String wikiDumpFile = "enwiki-20110405-pages-articles.xml";
    String luceneIndexName = "enwiki-20110405-lucene";
    String logFile = luceneIndexName + ".log";
    boolean bIgnoreStubs = false;
    String writeToTextFilesDir = "";

    for (int i = 0; i < args.length; ++i) {
        if (args[i].equals("-luceneindex"))
            luceneIndexName = args[++i];
        if (args[i].equals("-basedir"))
            baseDir = args[++i];
        if (args[i].equals("-logfile"))
            logFile = args[++i];
        if (args[i].equals("-dumpfile"))
            wikiDumpFile = args[++i];
        if (args[i].equals("-ignorestubs"))
            bIgnoreStubs = true;
        if (args[i].equals("-writetotextfilesdir")) {
            writeToTextFilesDir = args[++i];
        }
    }

    Map<String, Analyzer> analyzerPerField = new HashMap<>();
    analyzerPerField.put("tokenized_title", new StandardAnalyzer());
    analyzerPerField.put("contents", new StandardAnalyzer());
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(), analyzerPerField);

    File basePath = new File(baseDir);
    File luceneIndex = new File(basePath.getCanonicalPath() + File.separator + luceneIndexName);
    logFile = basePath.getCanonicalPath() + File.separator + logFile;

    // log to file and console:
    // PlainLogger logger = new PlainLogger( logFile );
    // log only to console:
    PlainLogger logger = new PlainLogger();

    logger.log("Work directory: " + basePath.getCanonicalPath());
    logger.log("Lucene index: " + luceneIndexName);
    logger.log("Wikipedia dumpfile: " + wikiDumpFile);
    logger.log("");
    if (bIgnoreStubs)
        logger.log("Ignoring stubs");
    else
        logger.log("Including stubs");
    logger.log("");

    // create the index
    Directory indexDirectory = FSDirectory.open(FileSystems.getDefault().getPath(baseDir));
    IndexWriter indexWriter = new IndexWriter(indexDirectory, new IndexWriterConfig(analyzer));

    Extractor wikidumpExtractor = new Extractor(basePath.getCanonicalPath() + File.separator + wikiDumpFile);
    wikidumpExtractor.setLinkSeparator("_");
    wikidumpExtractor.setCategorySeparator("_");

    int iStubs = 0;
    int iArticleCount = 0;
    int iSkippedPageCount = 0;
    long iStartTime = java.lang.System.nanoTime();
    long iTime = iStartTime;

    FieldType fieldType = new FieldType();
    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    fieldType.setTokenized(true);
    fieldType.setStoreTermVectors(true);
    fieldType.setStoreTermVectorPositions(true);

    while (wikidumpExtractor.nextPage()) {
        if (wikidumpExtractor.getPageType() != Extractor.PageType.ARTICLE) {
            ++iSkippedPageCount;
            continue;
        }

        if (bIgnoreStubs && wikidumpExtractor.getStub()) {
            ++iStubs;
            continue;
        }

        Document doc = new Document();
        ++iArticleCount;

        doc.add(new StoredField("path", String.format("%d", iArticleCount)));

        wikidumpExtractor.setTitleSeparator("_");
        String title = wikidumpExtractor.getPageTitle(false).toLowerCase();
        doc.add(new Field("title", title, fieldType));

        wikidumpExtractor.setTitleSeparator(" ");
        doc.add(new Field("tokenized_title", wikidumpExtractor.getPageTitle(false).toLowerCase(), fieldType));

        doc.add(new Field("categories", wikidumpExtractor.getPageCategories().toLowerCase(), fieldType));
        doc.add(new Field("links", wikidumpExtractor.getPageLinks().toLowerCase(), fieldType));
        doc.add(new Field("contents", wikidumpExtractor.getPageAbstract().toLowerCase(), fieldType));

        indexWriter.addDocument(doc);

        if (!writeToTextFilesDir.isEmpty()) {
            String fileName = doc.get("title");
            fileName = fileName.replace('/', '_');
            writeToTextFile(writeToTextFilesDir, fileName, doc.get("contents"));
        }

        if (iArticleCount % 50000 == 0) {
            logger.add(iArticleCount + " (" + NanoTimeFormatter.getS(System.nanoTime() - iTime) + "s) ");
            iTime = System.nanoTime();

            if (iArticleCount % 250000 == 0) {
                try {
                    indexWriter.commit();
                    logger.add("-- commit. Skipped page count " + iSkippedPageCount + " (+ " + iStubs + " stubs)");
                    logger.log(String.format(", time %sm", NanoTimeFormatter.getM(System.nanoTime() - iStartTime)));
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }

    logger.log("");
    logger.log(String.format("Overall time %s minutes, ", NanoTimeFormatter.getM(System.nanoTime() - iStartTime)));
    logger.add("collected " + iArticleCount + " articles, ");
    logger.add("skipped " + iSkippedPageCount + " nonarticle pages,");
    logger.log("skipped " + iStubs + " stubs.");
    logger.log("");

    iTime = System.nanoTime();
    logger.add(" closing...");
    indexWriter.close();
    logger.log(" done in " + NanoTimeFormatter.getS(System.nanoTime() - iTime) + "s.");
    logger.close();

    System.exit(0);
}
From source file:net.chwise.indexing.DocumentFromWikitextExtractor.java
License:Open Source License
public Document getLuceneDocument(SimpleFieldToFieldProcessor simpleFieldToFieldProcessor, String pathStr,
        String title, String smiles, String wikitext) throws LinkTargetException, EngineException {
    // Retrieve a page
    PageTitle pageTitle = PageTitle.make(config, pathStr);
    PageId pageId = new PageId(pageTitle, -1);

    // Compile the retrieved page
    EngProcessedPage cp = engine.postprocess(pageId, wikitext, null);

    // Check chembox in the beginning
    TextConverter markupStripConverter = new TextConverter(config, wrapCol);
    String text = (String) markupStripConverter.go(cp.getPage());

    InfoBoxDataExtractor infoBoxDataExtractor = new InfoBoxDataExtractor();
    Map<String, String> infoboxFields = (Map<String, String>) infoBoxDataExtractor.go(cp.getPage());

    // Update statistics if required
    if (calculateStatistics) {
        for (String key : infoboxFields.keySet()) {
            int count = infoboxKeysCout.containsKey(key) ? infoboxKeysCout.get(key) : 0;
            infoboxKeysCout.put(key, count + 1);
        }
    }

    // Create Lucene document
    Document document = new Document();
    document.add(new TextField(DocDefinitions.TITLE_FIELD_NAME, title, Field.Store.YES));
    document.add(new TextField(DocDefinitions.TEXT_FIELD_NAME, text, Field.Store.YES));
    document.add(new TextField(DocDefinitions.STRUCTURE_SMILES_FIELD_NAME, smiles, Field.Store.YES));
    document.add(new TextField(DocDefinitions.URL_FIELD_NAME, "#", Field.Store.YES));
    document.add(new StoredField(DocDefinitions.STRUCTURE_MOL_FIELD_NAME,
            toMOLConverter.MOLChargesKludge(toMOLConverter.convert(smiles))));

    simpleFieldToFieldProcessor.process(infoboxFields, document);
    return document;
}
From source file:net.semanticmetadata.lire.builders.AbstractLocalDocumentBuilder.java
License:Open Source License
/**
 * Creates the Lucene Fields with the vector representation of a list of local features.
 *
 * @param listOfLocalFeatures is the list of local features.
 * @param extractorItem       is the extractor that was used to extract the features.
 * @param listOfCodebooks     is the list which can contain one or more codebooks to be used for the aggregation of the local features.
 * @return Lucene Fields with the vector representation of the list of local features.
 */
public Field[] createLocalDescriptorFields(List<? extends LocalFeature> listOfLocalFeatures,
        ExtractorItem extractorItem, LinkedList<Cluster[]> listOfCodebooks) {
    Field[] result = new Field[listOfCodebooks.size() * 2];
    int count = 0;
    for (Cluster[] codebook : listOfCodebooks) {
        aggregator.createVectorRepresentation(listOfLocalFeatures, codebook);
        result[count] = new StoredField(fieldNamesDictionary.get(extractorItem).get(codebook.length)[0],
                aggregator.getByteVectorRepresentation());
        result[count + 1] = new TextField(fieldNamesDictionary.get(extractorItem).get(codebook.length)[1],
                aggregator.getStringVectorRepresentation(), Field.Store.YES);
        count += 2;
    }
    return result;
}
From source file:net.semanticmetadata.lire.builders.GlobalDocumentBuilder.java
License:Open Source License
/**
 * Extracts the global feature and returns the Lucene Fields for the selected image.
 *
 * @param image         is the selected image.
 * @param extractorItem is the extractor to be used to extract the features.
 * @return Lucene Fields.
 */
private Field[] getGlobalDescriptorFields(BufferedImage image, ExtractorItem extractorItem) {
    Field[] result;
    // if (hashingEnabled) result = new Field[2];
    // else result = new Field[1];
    Field hash = null;
    Field vector = null;
    GlobalFeature globalFeature = extractGlobalFeature(image, (GlobalFeature) extractorItem.getExtractorInstance());

    if (!useDocValues) {
        // TODO: Stored field is compressed and upon search decompression takes a lot of time (> 50% with a small index with 50k images). Find something else ...
        vector = new StoredField(extractorItems.get(extractorItem)[0],
                new BytesRef(globalFeature.getByteArrayRepresentation()));
    } else {
        // Alternative: The DocValues field. It's extremely fast to read, but it's all in RAM most likely.
        vector = new BinaryDocValuesField(extractorItems.get(extractorItem)[0],
                new BytesRef(globalFeature.getByteArrayRepresentation()));
    }

    // if BitSampling is an issue we add a field with the given hashFunctionsFileName and the suffix "hash":
    if (hashingEnabled) {
        // TODO: check eventually if there is a more compressed string version of the integers, i.e. the hex string
        if (globalFeature.getFeatureVector().length <= 3100) {
            int[] hashes;
            if (hashingMode == HashingMode.BitSampling) {
                hashes = BitSampling.generateHashes(globalFeature.getFeatureVector());
                hash = new TextField(extractorItems.get(extractorItem)[1],
                        SerializationUtils.arrayToString(hashes), Field.Store.YES);
            } else if (hashingMode == HashingMode.LSH) {
                hashes = LocalitySensitiveHashing.generateHashes(globalFeature.getFeatureVector());
                hash = new TextField(extractorItems.get(extractorItem)[1],
                        SerializationUtils.arrayToString(hashes), Field.Store.YES);
            } else if (hashingMode == HashingMode.MetricSpaces) {
                if (MetricSpaces.supportsFeature(globalFeature)) {
                    // the name of the field is set at "addExtractor" time.
                    hash = new TextField(extractorItems.get(extractorItem)[1],
                            MetricSpaces.generateHashString(globalFeature), Field.Store.YES);
                }
            }
        } else
            System.err.println("Could not create hashes, feature vector too long: "
                    + globalFeature.getFeatureVector().length + " (" + globalFeature.getClass().getName() + ")");
    }

    if (hash != null)
        result = new Field[] { vector, hash };
    else
        result = new Field[] { vector };
    return result;
}
From source file:net.semanticmetadata.lire.GeneralTest.java
License:Open Source License
public void testCreateAndSearchSmallIndex() throws IOException {
    for (int i = 0, buildersLength = builders.length; i < buildersLength; i++) {
        DocumentBuilder b = builders[i];
        // create an index with a specific builder:
        IndexWriter iw = LuceneUtils.createIndexWriter(indexPath + "-small", true);
        for (String identifier : testFiles) {
            Document doc = b.createDocument(new FileInputStream(testFilesPath + identifier), identifier);
            doc.add(new StoredField("video_file", "surgery1.mp4"));
            doc.add(new StoredField("timestamp", "25"));
            iw.addDocument(doc);
        }
        iw.close();

        ImageSearcher s = searchers[i];
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath + "-small")));
        for (int k = 0; k < reader.maxDoc(); k++) {
            Document query = reader.document(k);
            ImageSearchHits hits = s.search(query, reader);
            for (int y = 0; y < hits.length(); y++) {
                Document result = hits.doc(y);
                if (y == 0) { // check if the first result is the query:
                    assertEquals(result.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]
                            .equals(query.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), true);
                    System.out.println(result.getValues("video_file")[0]);
                } else { // check if they are ordered by distance:
                    assertEquals(hits.score(y) < hits.score(y - 1), true);
                }
            }
        }
    }
}
From source file:net.semanticmetadata.lire.imageanalysis.bovw.BOVWBuilder.java
License:Open Source License
private void createVisualWords(Document d, LireFeature f) {
    double[] tmpHist = new double[numClusters];
    Arrays.fill(tmpHist, 0d);
    IndexableField[] fields = d.getFields(localFeatureFieldName);
    // remove the fields if they are already there ...
    d.removeField(visualWordsFieldName);
    d.removeField(localFeatureHistFieldName);

    // find the appropriate cluster for each feature:
    for (int j = 0; j < fields.length; j++) {
        f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset,
                fields[j].binaryValue().length);
        tmpHist[clusterForFeature((Histogram) f)]++;
    }
    // quantize(tmpHist);
    d.add(new TextField(visualWordsFieldName, arrayToVisualWordString(tmpHist), Field.Store.YES));
    d.add(new StoredField(localFeatureHistFieldName, SerializationUtils.toByteArray(tmpHist)));

    // remove local features to save some space if requested:
    if (DELETE_LOCAL_FEATURES) {
        d.removeFields(localFeatureFieldName);
    }

    // for debugging ..
    // System.out.println(d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0] + " " + Arrays.toString(tmpHist));
}
From source file:net.semanticmetadata.lire.imageanalysis.bovw.LocalFeatureHistogramBuilderFromCodeBook.java
License:Open Source License
private void createVisualWords(Document d, LireFeature f) {
    double[] tmpHist = new double[numClusters];
    IndexableField[] fields = d.getFields(localFeatureFieldName);
    // remove the fields if they are already there ...
    d.removeField(visualWordsFieldName);
    d.removeField(localFeatureHistFieldName);

    // find the appropriate cluster for each feature:
    for (int j = 0; j < fields.length; j++) {
        f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset,
                fields[j].binaryValue().length);
        tmpHist[clusterForFeature((Histogram) f, clusters)]++;
    }
    d.add(new StoredField(localFeatureHistFieldName, SerializationUtils.toByteArray(tmpHist)));
    // quantize(tmpHist);
    d.add(new TextField(visualWordsFieldName, arrayToVisualWordString(tmpHist), Field.Store.YES));

    // remove local features to save some space if requested:
    if (DELETE_LOCAL_FEATURES) {
        d.removeFields(localFeatureFieldName);
    }

    // for debugging ..
    // System.out.println(d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0] + " " + Arrays.toString(tmpHist));
}
From source file:net.semanticmetadata.lire.imageanalysis.bovw.LocalFeatureHistogramBuilderKmeansPlusPlus.java
License:Open Source License
/**
 * Uses an existing index, where each and every document should have a set of local features. A number of
 * random images (numDocsForVocabulary) is selected and clustered to get a vocabulary of visual words
 * (the cluster means). For all images a histogram on the visual words is created and added to the documents.
 * Pre-existing histograms are deleted, so this method can be used for re-indexing.
 *
 * @throws java.io.IOException
 */
public void index() throws IOException {
    df.setMaximumFractionDigits(3);
    // find the documents for building the vocabulary:
    HashSet<Integer> docIDs = selectVocabularyDocs();
    System.out.println("Using " + docIDs.size() + " documents to build the vocabulary.");
    KMeansPlusPlusClusterer kpp = new KMeansPlusPlusClusterer(numClusters, 15);
    // fill the KMeans object:
    LinkedList<DoublePoint> features = new LinkedList<DoublePoint>();
    // Needed for check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    for (Iterator<Integer> iterator = docIDs.iterator(); iterator.hasNext();) {
        int nextDoc = iterator.next();
        if (reader.hasDeletions() && !liveDocs.get(nextDoc))
            continue; // if it is deleted, just ignore it.
        Document d = reader.document(nextDoc);
        // features.clear();
        IndexableField[] fields = d.getFields(localFeatureFieldName);
        String file = d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0];
        for (int j = 0; j < fields.length; j++) {
            LireFeature f = getFeatureInstance();
            f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset,
                    fields[j].binaryValue().length);
            // copy the data over to new array ...
            double[] feat = new double[f.getDoubleHistogram().length];
            System.arraycopy(f.getDoubleHistogram(), 0, feat, 0, feat.length);
            features.add(new DoublePoint(f.getDoubleHistogram()));
        }
    }
    if (features.size() < numClusters) {
        // this cannot work. You need more data points than clusters.
        throw new UnsupportedOperationException("Only " + features.size() + " features found to cluster in "
                + numClusters + ". Try to use less clusters or more images.");
    }
    // do the clustering:
    System.out.println("Number of local features: " + df.format(features.size()));
    System.out.println("Starting clustering ...");
    List<CentroidCluster<DoublePoint>> clusterList = kpp.cluster(features);
    // TODO: Serializing clusters to a file on the disk ...
    System.out.println("Clustering finished, " + clusterList.size() + " clusters found");
    clusters = new LinkedList<double[]>();
    for (Iterator<CentroidCluster<DoublePoint>> iterator = clusterList.iterator(); iterator.hasNext();) {
        CentroidCluster<DoublePoint> centroidCluster = iterator.next();
        clusters.add(centroidCluster.getCenter().getPoint());
    }
    System.out.println("Creating histograms ...");
    int[] tmpHist = new int[numClusters];
    IndexWriter iw = LuceneUtils.createIndexWriter(((DirectoryReader) reader).directory(), true,
            LuceneUtils.AnalyzerType.WhitespaceAnalyzer, 256d);
    // careful: copy reader to RAM for faster access when reading ...
    // reader = IndexReader.open(new RAMDirectory(reader.directory()), true);
    LireFeature f = getFeatureInstance();
    for (int i = 0; i < reader.maxDoc(); i++) {
        try {
            if (reader.hasDeletions() && !liveDocs.get(i))
                continue;
            for (int j = 0; j < tmpHist.length; j++) {
                tmpHist[j] = 0;
            }
            Document d = reader.document(i);
            IndexableField[] fields = d.getFields(localFeatureFieldName);
            // remove the fields if they are already there ...
            d.removeField(visualWordsFieldName);
            d.removeField(localFeatureHistFieldName);
            // find the appropriate cluster for each feature:
            for (int j = 0; j < fields.length; j++) {
                f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset,
                        fields[j].binaryValue().length);
                tmpHist[clusterForFeature(f, clusters)]++;
            }
            // System.out.println(Arrays.toString(tmpHist));
            d.add(new StoredField(localFeatureHistFieldName, SerializationUtils.toByteArray(normalize(tmpHist))));
            quantize(tmpHist);
            d.add(new TextField(visualWordsFieldName, arrayToVisualWordString(tmpHist), Field.Store.YES));
            // remove local features to save some space if requested:
            if (DELETE_LOCAL_FEATURES) {
                d.removeFields(localFeatureFieldName);
            }
            // now write the new one. we use the identifier to update ;)
            iw.updateDocument(new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER,
                    d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), d);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    iw.commit();
    // this one does the "old" commit(), it removes the deleted local features.
    iw.forceMerge(1);
    iw.close();
    System.out.println("Finished.");
}
From source file:net.semanticmetadata.lire.imageanalysis.bovw.VLADBuilder.java
License:Open Source License
private void createVisualWords(Document d, LireFeature f) {
    IndexableField[] fields = d.getFields(localFeatureFieldName);
    // remove the fields if they are already there ...
    d.removeField(vladFieldName);
    d.removeField(vladHistFieldName);

    double[] vlad = new double[clusters.length * (clusters[0].getMean()).length];
    Arrays.fill(vlad, 0d);
    int clusterIndex;
    double[] mean;
    // VLAD - Vector of Locally Aggregated Descriptors
    for (int j = 0; j < fields.length; j++) {
        f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset,
                fields[j].binaryValue().length);
        clusterIndex = clusterForFeature((Histogram) f);
        // System.out.println("clusterIndex = " + clusterIndex);
        mean = clusters[clusterIndex].getMean();
        for (int k = 0; k < f.getDoubleHistogram().length; k++) {
            // System.out.println((clusterIndex*f.getDoubleHistogram().length+k) + " - mean: " + mean.length + " - feature: " + f.getDoubleHistogram().length);
            vlad[clusterIndex * f.getDoubleHistogram().length + k] += f.getDoubleHistogram()[k] - mean[k];
        }
    }
    normalize(vlad);
    GenericDoubleLireFeature feat = new GenericDoubleLireFeature();
    feat.setData(vlad);
    // System.out.println(feat.getStringRepresentation());
    d.add(new TextField(vladFieldName, feat.getStringRepresentation(), Field.Store.YES));
    d.add(new StoredField(vladHistFieldName, feat.getByteArrayRepresentation()));

    // remove local features to save some space if requested:
    if (DELETE_LOCAL_FEATURES) {
        d.removeFields(localFeatureFieldName);
    }

    // for debugging ..
    // System.out.println(d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0] + " " + Arrays.toString(vlad));
}
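All of the examples above write StoredField values at indexing time. For completeness, a minimal sketch of reading such values back from the index; the index path and the field names "path" and "hist" are illustrative placeholders, not taken from the examples above.

import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

// Hypothetical helper: retrieve stored-only field values from a document.
static void printStoredFields(String indexDir) throws IOException {
    try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexDir)))) {
        Document doc = reader.document(0);          // stored fields of the first document
        String path = doc.get("path");              // StoredField added with a String value
        BytesRef hist = doc.getBinaryValue("hist"); // StoredField added with byte[]/BytesRef
        System.out.println(path + " / " + (hist != null ? hist.length + " bytes" : "no binary value"));
    }
}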