Example usage for org.apache.lucene.document StoredField StoredField

List of usage examples for org.apache.lucene.document StoredField StoredField

Introduction

On this page you can find example usage for the constructor of org.apache.lucene.document.StoredField.

Prototype

public StoredField(String name, double value) 

Document

Create a stored-only field with the given double value.
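Below is a minimal sketch of this double overload, assuming a local FSDirectory index; the path "example-index" and the field name "price" are illustrative, not taken from the examples that follow. Note that the constructor is overloaded, and the usage examples below also pass String, byte[], and BytesRef values.

import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class StoredFieldDoubleExample {
    public static void main(String[] args) throws Exception {
        // "example-index" and "price" are hypothetical names used only for this sketch.
        Directory dir = FSDirectory.open(Paths.get("example-index"));

        try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            Document doc = new Document();
            // Stored only: the value is returned with search results but is neither indexed nor searchable.
            doc.add(new StoredField("price", 9.99));
            writer.addDocument(doc);
        }

        try (DirectoryReader reader = DirectoryReader.open(dir)) {
            // Read the stored value back from the first (and only) document.
            double price = reader.document(0).getField("price").numericValue().doubleValue();
            System.out.println("price = " + price);
        }
    }
}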

Usage

From source file:luceneindexer.files.LuceneWriter.java

public void addPark(Park park) {
    Document doc = new Document();
    doc.add(new TextField("name", park.getname(), Field.Store.YES));
    doc.add(new StoredField("coordinates", park.getPos().toString()));
    try {
        indexWriter.addDocument(doc);
    } catch (IOException ex) {
        System.out.println(
                "Threw an exception trying to add the doc: " + ex.getClass() + " :: " + ex.getMessage());
    }
    System.out.println(park.getname());

}

From source file:mw.wikidump.MakeLuceneIndex.java

License:Open Source License

/**
 * @param args
 * @throws IOException
 * @throws ParseException
 */
public static void main(String[] args) throws IOException, ParseException {
    String baseDir = "";
    String wikiDumpFile = "enwiki-20110405-pages-articles.xml";
    String luceneIndexName = "enwiki-20110405-lucene";
    String logFile = luceneIndexName + ".log";
    boolean bIgnoreStubs = false;
    String writeToTextFilesDir = "";

    for (int i = 0; i < args.length; ++i) {
        if (args[i].equals("-luceneindex"))
            luceneIndexName = args[++i];

        if (args[i].equals("-basedir"))
            baseDir = args[++i];

        if (args[i].equals("-logfile"))
            logFile = args[++i];

        if (args[i].equals("-dumpfile"))
            wikiDumpFile = args[++i];

        if (args[i].equals("-ignorestubs"))
            bIgnoreStubs = true;

        if (args[i].equals("-writetotextfilesdir")) {
            writeToTextFilesDir = args[++i];
        }
    }

    Map<String, Analyzer> analyzerPerField = new HashMap<>();
    analyzerPerField.put("tokenized_title", new StandardAnalyzer());
    analyzerPerField.put("contents", new StandardAnalyzer());

    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(), analyzerPerField);

    File basePath = new File(baseDir);
    File luceneIndex = new File(basePath.getCanonicalPath() + File.separator + luceneIndexName);

    logFile = basePath.getCanonicalPath() + File.separator + logFile;

    // log to file and console:
    // PlainLogger logger = new PlainLogger( logFile );
    // log only to console:
    PlainLogger logger = new PlainLogger();

    logger.log("Work directory:     " + basePath.getCanonicalPath());
    logger.log("Lucene index:       " + luceneIndexName);
    logger.log("Wikipedia dumpfile: " + wikiDumpFile);
    logger.log("");
    if (bIgnoreStubs)
        logger.log("Ignoring stubs");
    else
        logger.log("Including stubs");
    logger.log("");

    // create the index
    Directory indexDirectory = FSDirectory.open(luceneIndex.toPath());
    IndexWriter indexWriter = new IndexWriter(indexDirectory, new IndexWriterConfig(analyzer));

    Extractor wikidumpExtractor = new Extractor(basePath.getCanonicalPath() + File.separator + wikiDumpFile);
    wikidumpExtractor.setLinkSeparator("_");
    wikidumpExtractor.setCategorySeparator("_");

    int iStubs = 0;
    int iArticleCount = 0;
    int iSkippedPageCount = 0;
    long iStartTime = java.lang.System.nanoTime();
    long iTime = iStartTime;

    FieldType fieldType = new FieldType();
    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    fieldType.setTokenized(true);
    fieldType.setStoreTermVectors(true);
    fieldType.setStoreTermVectorPositions(true);

    while (wikidumpExtractor.nextPage()) {
        if (wikidumpExtractor.getPageType() != Extractor.PageType.ARTICLE) {
            ++iSkippedPageCount;
            continue;
        }

        if (bIgnoreStubs && wikidumpExtractor.getStub()) {
            ++iStubs;
            continue;
        }

        Document doc = new Document();
        ++iArticleCount;

        doc.add(new StoredField("path", String.format("%d", iArticleCount)));

        wikidumpExtractor.setTitleSeparator("_");
        String title = wikidumpExtractor.getPageTitle(false).toLowerCase();
        doc.add(new Field("title", title, fieldType));

        wikidumpExtractor.setTitleSeparator(" ");
        doc.add(new Field("tokenized_title", wikidumpExtractor.getPageTitle(false).toLowerCase(), fieldType));

        doc.add(new Field("categories", wikidumpExtractor.getPageCategories().toLowerCase(), fieldType));
        doc.add(new Field("links", wikidumpExtractor.getPageLinks().toLowerCase(), fieldType));
        doc.add(new Field("contents", wikidumpExtractor.getPageAbstract().toLowerCase(), fieldType));

        indexWriter.addDocument(doc);

        if (!writeToTextFilesDir.isEmpty()) {
            String fileName = doc.get("title");
            fileName = fileName.replace('/', '_');
            writeToTextFile(writeToTextFilesDir, fileName, doc.get("contents"));
        }

        if (iArticleCount % 50000 == 0) {
            logger.add(iArticleCount + " (" + NanoTimeFormatter.getS(System.nanoTime() - iTime) + "s) ");
            iTime = System.nanoTime();

            if (iArticleCount % 250000 == 0) {
                try {
                    indexWriter.commit();
                    logger.add(
                            "-- commit. Skipped page count " + iSkippedPageCount + " (+ " + iStubs + " stubs)");
                    logger.log(String.format(", time %sm",
                            NanoTimeFormatter.getM(System.nanoTime() - iStartTime)));
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }

    logger.log("");
    logger.log(
            String.format("Overall time %s minutes, ", NanoTimeFormatter.getM(System.nanoTime() - iStartTime)));
    logger.add("collected " + iArticleCount + " articles, ");
    logger.add("skipped " + iSkippedPageCount + " nonarticle pages,");
    logger.log("skipped " + iStubs + " stubs.");
    logger.log("");

    iTime = System.nanoTime();
    logger.add(" closing...");
    indexWriter.close();
    logger.log(" done in " + NanoTimeFormatter.getS(System.nanoTime() - iTime) + "s.");

    logger.close();
    System.exit(0);
}

From source file:net.chwise.indexing.DocumentFromWikitextExtractor.java

License:Open Source License

public Document getLuceneDocument(SimpleFieldToFieldProcessor simpleFieldToFieldProcessor, String pathStr,
        String title, String smiles, String wikitext) throws LinkTargetException, EngineException {
    // Retrieve a page
    PageTitle pageTitle = PageTitle.make(config, pathStr);

    PageId pageId = new PageId(pageTitle, -1);

    // Compile the retrieved page
    EngProcessedPage cp = engine.postprocess(pageId, wikitext, null);

    // Check chembox at the beginning

    TextConverter markupStripConverter = new TextConverter(config, wrapCol);
    String text = (String) markupStripConverter.go(cp.getPage());

    InfoBoxDataExtractor infoBoxDataExtractor = new InfoBoxDataExtractor();
    Map<String, String> infoboxFields = (Map<String, String>) infoBoxDataExtractor.go(cp.getPage());

    //Update statistics if required
    if (calculateStatistics) {
        for (String key : infoboxFields.keySet()) {
            int count = infoboxKeysCout.containsKey(key) ? infoboxKeysCout.get(key) : 0;
            infoboxKeysCout.put(key, count + 1);
        }
    }

    //Create lucene document
    Document document = new Document();
    document.add(new TextField(DocDefinitions.TITLE_FIELD_NAME, title, Field.Store.YES));
    document.add(new TextField(DocDefinitions.TEXT_FIELD_NAME, text, Field.Store.YES));
    document.add(new TextField(DocDefinitions.STRUCTURE_SMILES_FIELD_NAME, smiles, Field.Store.YES));
    document.add(new TextField(DocDefinitions.URL_FIELD_NAME, "#", Field.Store.YES));
    document.add(new StoredField(DocDefinitions.STRUCTURE_MOL_FIELD_NAME,
            toMOLConverter.MOLChargesKludge(toMOLConverter.convert(smiles))));

    simpleFieldToFieldProcessor.process(infoboxFields, document);

    return document;
}

From source file:net.semanticmetadata.lire.builders.AbstractLocalDocumentBuilder.java

License:Open Source License

/**
 * Creates the Lucene Fields with the vector representation of a list of local features.
 * @param listOfLocalFeatures is the list of local features.
 * @param extractorItem is the extractor that was used to extract the features.
 * @param listOfCodebooks is the list which can contain one or more codebooks to be used for the aggregation of the local features.
 * @return Lucene Fields with the vector representation of the list of local features.
 */
public Field[] createLocalDescriptorFields(List<? extends LocalFeature> listOfLocalFeatures,
        ExtractorItem extractorItem, LinkedList<Cluster[]> listOfCodebooks) {
    Field[] result = new Field[listOfCodebooks.size() * 2];
    int count = 0;
    for (Cluster[] codebook : listOfCodebooks) {
        aggregator.createVectorRepresentation(listOfLocalFeatures, codebook);
        result[count] = new StoredField(fieldNamesDictionary.get(extractorItem).get(codebook.length)[0],
                aggregator.getByteVectorRepresentation());
        result[count + 1] = new TextField(fieldNamesDictionary.get(extractorItem).get(codebook.length)[1],
                aggregator.getStringVectorRepresentation(), Field.Store.YES);
        count += 2;
    }

    return result;
}

From source file:net.semanticmetadata.lire.builders.GlobalDocumentBuilder.java

License:Open Source License

/**
 * Extracts the global feature and returns the Lucene Fields for the selected image.
 *
 * @param image         is the selected image.
 * @param extractorItem is the extractor to be used to extract the features.
 * @return Lucene Fields.
 */
private Field[] getGlobalDescriptorFields(BufferedImage image, ExtractorItem extractorItem) {
    Field[] result;
    //        if (hashingEnabled) result = new Field[2];
    //        else result = new Field[1];
    Field hash = null;
    Field vector = null;

    GlobalFeature globalFeature = extractGlobalFeature(image,
            (GlobalFeature) extractorItem.getExtractorInstance());

    if (!useDocValues) {
        // TODO: Stored field is compressed and upon search decompression takes a lot of time (> 50% with a small index with 50k images). Find something else ...
        vector = new StoredField(extractorItems.get(extractorItem)[0],
                new BytesRef(globalFeature.getByteArrayRepresentation()));
    } else {
        // Alternative: The DocValues field. It's extremely fast to read, but it's all in RAM most likely.
        vector = new BinaryDocValuesField(extractorItems.get(extractorItem)[0],
                new BytesRef(globalFeature.getByteArrayRepresentation()));
    }

    // if BitSampling is an issue we add a field with the given hashFunctionsFileName and the suffix "hash":
    if (hashingEnabled) {
        // TODO: check eventually if there is a more compressed string version of the integers. i.e. the hex string
        if (globalFeature.getFeatureVector().length <= 3100) {
            int[] hashes;
            if (hashingMode == HashingMode.BitSampling) {
                hashes = BitSampling.generateHashes(globalFeature.getFeatureVector());
                hash = new TextField(extractorItems.get(extractorItem)[1],
                        SerializationUtils.arrayToString(hashes), Field.Store.YES);
            } else if (hashingMode == HashingMode.LSH) {
                hashes = LocalitySensitiveHashing.generateHashes(globalFeature.getFeatureVector());
                hash = new TextField(extractorItems.get(extractorItem)[1],
                        SerializationUtils.arrayToString(hashes), Field.Store.YES);
            } else if (hashingMode == HashingMode.MetricSpaces) {
                if (MetricSpaces.supportsFeature(globalFeature)) {
                    // the name of the field is set at "addExtractor" time.
                    hash = new TextField(extractorItems.get(extractorItem)[1],
                            MetricSpaces.generateHashString(globalFeature), Field.Store.YES);
                }
            }
        } else
            System.err.println("Could not create hashes, feature vector too long: "
                    + globalFeature.getFeatureVector().length + " (" + globalFeature.getClass().getName()
                    + ")");
    }
    if (hash != null)
        result = new Field[] { vector, hash };
    else
        result = new Field[] { vector };
    return result;
}

From source file:net.semanticmetadata.lire.GeneralTest.java

License:Open Source License

public void testCreateAndSearchSmallIndex() throws IOException {
    for (int i = 0, buildersLength = builders.length; i < buildersLength; i++) {
        DocumentBuilder b = builders[i];
        // create an index with a specific builder:
        IndexWriter iw = LuceneUtils.createIndexWriter(indexPath + "-small", true);
        for (String identifier : testFiles) {
            Document doc = b.createDocument(new FileInputStream(testFilesPath + identifier), identifier);
            doc.add(new StoredField("video_file", "surgery1.mp4"));
            doc.add(new StoredField("timestamp", "25"));
            iw.addDocument(doc);
        }
        iw.close();

        ImageSearcher s = searchers[i];
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath + "-small")));
        for (int k = 0; k < reader.maxDoc(); k++) {
            Document query = reader.document(k);
            ImageSearchHits hits = s.search(query, reader);
            for (int y = 0; y < hits.length(); y++) {
                Document result = hits.doc(y);
                if (y == 0) {
                    // check if the first result is the query:
                    assertEquals(result.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]
                            .equals(query.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), true);
                    System.out.println(result.getValues("video_file")[0]);
                } else {
                    // check if they are ordered by distance:
                    assertEquals(hits.score(y) < hits.score(y - 1), true);
                }
            }
        }
    }
}

From source file:net.semanticmetadata.lire.imageanalysis.bovw.BOVWBuilder.java

License:Open Source License

private void createVisualWords(Document d, LireFeature f) {
    double[] tmpHist = new double[numClusters];
    Arrays.fill(tmpHist, 0d);
    IndexableField[] fields = d.getFields(localFeatureFieldName);
    // remove the fields if they are already there ...
    d.removeField(visualWordsFieldName);
    d.removeField(localFeatureHistFieldName);

    // find the appropriate cluster for each feature:
    for (int j = 0; j < fields.length; j++) {
        f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset,
                fields[j].binaryValue().length);
        tmpHist[clusterForFeature((Histogram) f)]++;
    }
    //quantize(tmpHist);
    d.add(new TextField(visualWordsFieldName, arrayToVisualWordString(tmpHist), Field.Store.YES));
    d.add(new StoredField(localFeatureHistFieldName, SerializationUtils.toByteArray(tmpHist)));
    // remove local features to save some space if requested:
    if (DELETE_LOCAL_FEATURES) {
        d.removeFields(localFeatureFieldName);
    }

    // for debugging ..
    //        System.out.println(d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0] + " " + Arrays.toString(tmpHist));
}

From source file:net.semanticmetadata.lire.imageanalysis.bovw.LocalFeatureHistogramBuilderFromCodeBook.java

License:Open Source License

private void createVisualWords(Document d, LireFeature f) {
    double[] tmpHist = new double[numClusters];
    IndexableField[] fields = d.getFields(localFeatureFieldName);
    // remove the fields if they are already there ...
    d.removeField(visualWordsFieldName);
    d.removeField(localFeatureHistFieldName);

    // find the appropriate cluster for each feature:
    for (int j = 0; j < fields.length; j++) {
        f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset,
                fields[j].binaryValue().length);
        tmpHist[clusterForFeature((Histogram) f, clusters)]++;
    }
    d.add(new StoredField(localFeatureHistFieldName, SerializationUtils.toByteArray(tmpHist)));
    //quantize(tmpHist);
    d.add(new TextField(visualWordsFieldName, arrayToVisualWordString(tmpHist), Field.Store.YES));
    // remove local features to save some space if requested:
    if (DELETE_LOCAL_FEATURES) {
        d.removeFields(localFeatureFieldName);
    }

    // for debugging ..
    //        System.out.println(d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0] + " " + Arrays.toString(tmpHist));
}

From source file:net.semanticmetadata.lire.imageanalysis.bovw.LocalFeatureHistogramBuilderKmeansPlusPlus.java

License:Open Source License

/**
 * Uses an existing index, where each and every document should have a set of local features. A number of
 * random images (numDocsForVocabulary) is selected and clustered to get a vocabulary of visual words
 * (the cluster means). For all images a histogram on the visual words is created and added to the documents.
 * Pre-existing histograms are deleted, so this method can be used for re-indexing.
 *
 * @throws java.io.IOException
 */
public void index() throws IOException {
    df.setMaximumFractionDigits(3);
    // find the documents for building the vocabulary:
    HashSet<Integer> docIDs = selectVocabularyDocs();
    System.out.println("Using " + docIDs.size() + " documents to build the vocabulary.");
    KMeansPlusPlusClusterer kpp = new KMeansPlusPlusClusterer(numClusters, 15);
    // fill the KMeans object:
    LinkedList<DoublePoint> features = new LinkedList<DoublePoint>();
    // Needed for check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    for (Iterator<Integer> iterator = docIDs.iterator(); iterator.hasNext();) {
        int nextDoc = iterator.next();
        if (reader.hasDeletions() && !liveDocs.get(nextDoc))
            continue; // if it is deleted, just ignore it.
        Document d = reader.document(nextDoc);
        //            features.clear();
        IndexableField[] fields = d.getFields(localFeatureFieldName);
        String file = d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0];
        for (int j = 0; j < fields.length; j++) {
            LireFeature f = getFeatureInstance();
            f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset,
                    fields[j].binaryValue().length);
            // copy the data over to new array ...
            double[] feat = new double[f.getDoubleHistogram().length];
            System.arraycopy(f.getDoubleHistogram(), 0, feat, 0, feat.length);
            features.add(new DoublePoint(feat)); // add the copied array
        }
    }
    if (features.size() < numClusters) {
        // this cannot work. You need more data points than clusters.
        throw new UnsupportedOperationException("Only " + features.size() + " features found to cluster in "
                + numClusters + ". Try to use less clusters or more images.");
    }
    // do the clustering:
    System.out.println("Number of local features: " + df.format(features.size()));
    System.out.println("Starting clustering ...");
    List<CentroidCluster<DoublePoint>> clusterList = kpp.cluster(features);
    // TODO: Serializing clusters to a file on the disk ...
    System.out.println("Clustering finished, " + clusterList.size() + " clusters found");
    clusters = new LinkedList<double[]>();
    for (Iterator<CentroidCluster<DoublePoint>> iterator = clusterList.iterator(); iterator.hasNext();) {
        CentroidCluster<DoublePoint> centroidCluster = iterator.next();
        clusters.add(centroidCluster.getCenter().getPoint());
    }
    System.out.println("Creating histograms ...");
    int[] tmpHist = new int[numClusters];
    IndexWriter iw = LuceneUtils.createIndexWriter(((DirectoryReader) reader).directory(), true,
            LuceneUtils.AnalyzerType.WhitespaceAnalyzer, 256d);

    // careful: copy reader to RAM for faster access when reading ...
    //        reader = IndexReader.open(new RAMDirectory(reader.directory()), true);
    LireFeature f = getFeatureInstance();
    for (int i = 0; i < reader.maxDoc(); i++) {
        try {
            if (reader.hasDeletions() && !liveDocs.get(i))
                continue;
            for (int j = 0; j < tmpHist.length; j++) {
                tmpHist[j] = 0;
            }
            Document d = reader.document(i);
            IndexableField[] fields = d.getFields(localFeatureFieldName);
            // remove the fields if they are already there ...
            d.removeField(visualWordsFieldName);
            d.removeField(localFeatureHistFieldName);

            // find the appropriate cluster for each feature:
            for (int j = 0; j < fields.length; j++) {
                f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset,
                        fields[j].binaryValue().length);
                tmpHist[clusterForFeature(f, clusters)]++;
            }
            //                System.out.println(Arrays.toString(tmpHist));
            d.add(new StoredField(localFeatureHistFieldName,
                    SerializationUtils.toByteArray(normalize(tmpHist))));
            quantize(tmpHist);
            d.add(new TextField(visualWordsFieldName, arrayToVisualWordString(tmpHist), Field.Store.YES));

            // remove local features to save some space if requested:
            if (DELETE_LOCAL_FEATURES) {
                d.removeFields(localFeatureFieldName);
            }
            // now write the new one. we use the identifier to update ;)
            iw.updateDocument(new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER,
                    d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), d);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    iw.commit();
    // this one does the "old" commit(), it removes the deleted local features.
    iw.forceMerge(1);
    iw.close();
    System.out.println("Finished.");
}

From source file:net.semanticmetadata.lire.imageanalysis.bovw.VLADBuilder.java

License:Open Source License

private void createVisualWords(Document d, LireFeature f) {
    IndexableField[] fields = d.getFields(localFeatureFieldName);
    // remove the fields if they are already there ...
    d.removeField(vladFieldName);
    d.removeField(vladHistFieldName);
    double[] vlad = new double[clusters.length * (clusters[0].getMean()).length];
    Arrays.fill(vlad, 0d);
    int clusterIndex;
    double[] mean;
    // VLAD - Vector of Locally Aggregated Descriptors
    for (int j = 0; j < fields.length; j++) {
        f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset,
                fields[j].binaryValue().length);
        clusterIndex = clusterForFeature((Histogram) f);
        //            System.out.println("clusterIndex = " + clusterIndex);
        mean = clusters[clusterIndex].getMean();
        for (int k = 0; k < f.getDoubleHistogram().length; k++) {
            //                System.out.println((clusterIndex*f.getDoubleHistogram().length+k) + " - mean: " + mean.length + " - feature: " + f.getDoubleHistogram().length);
            vlad[clusterIndex * f.getDoubleHistogram().length + k] += f.getDoubleHistogram()[k] - mean[k];
        }
    }
    normalize(vlad);
    GenericDoubleLireFeature feat = new GenericDoubleLireFeature();
    feat.setData(vlad);
    //        System.out.println(feat.getStringRepresentation());
    d.add(new TextField(vladFieldName, feat.getStringRepresentation(), Field.Store.YES));
    d.add(new StoredField(vladHistFieldName, feat.getByteArrayRepresentation()));

    // remove local features to save some space if requested:
    if (DELETE_LOCAL_FEATURES) {
        d.removeFields(localFeatureFieldName);
    }

    // for debugging ..
    //        System.out.println(d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0] + " " + Arrays.toString(vlad));
}