List of usage examples for the org.apache.lucene.document.StoredField constructor
public StoredField(String name, double value)
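None of the snippets below uses this double-valued overload directly; they pass String or byte[]/BytesRef values instead. A minimal sketch of the overload above, with an illustrative field name and value (not taken from the examples):

import org.apache.lucene.document.Document;
import org.apache.lucene.document.StoredField;

// Minimal sketch: store a double alongside a document. A StoredField is
// stored for retrieval only; it is not indexed and not searchable by itself.
Document doc = new Document();
doc.add(new StoredField("price", 9.99));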
From source file:luceneindexer.files.LuceneWriter.java
public void addPark(Park park) {
    Document doc = new Document();
    doc.add(new TextField("name", park.getname(), Field.Store.YES));
    doc.add(new StoredField("coordinates", park.getPos().toString()));
    try {
        indexWriter.addDocument(doc);
    } catch (IOException ex) {
        System.out.println("Threw an exception trying to add the doc: " + ex.getClass() + " :: " + ex.getMessage());
    }
    System.out.println(park.getname());
}
From source file:mw.wikidump.MakeLuceneIndex.java
License:Open Source License
/**
 * @param args
 * @throws IOException
 * @throws ParseException
 */
public static void main(String[] args) throws IOException, ParseException {
    String baseDir = "";
    String wikiDumpFile = "enwiki-20110405-pages-articles.xml";
    String luceneIndexName = "enwiki-20110405-lucene";
    String logFile = luceneIndexName + ".log";
    boolean bIgnoreStubs = false;
    String writeToTextFilesDir = "";

    for (int i = 0; i < args.length; ++i) {
        if (args[i].equals("-luceneindex"))
            luceneIndexName = args[++i];
        if (args[i].equals("-basedir"))
            baseDir = args[++i];
        if (args[i].equals("-logfile"))
            logFile = args[++i];
        if (args[i].equals("-dumpfile"))
            wikiDumpFile = args[++i];
        if (args[i].equals("-ignorestubs"))
            bIgnoreStubs = true;
        if (args[i].equals("-writetotextfilesdir")) {
            writeToTextFilesDir = args[++i];
        }
    }

    Map<String, Analyzer> analyzerPerField = new HashMap<>();
    analyzerPerField.put("tokenized_title", new StandardAnalyzer());
    analyzerPerField.put("contents", new StandardAnalyzer());
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(), analyzerPerField);

    File basePath = new File(baseDir);
    File luceneIndex = new File(basePath.getCanonicalPath() + File.separator + luceneIndexName);
    logFile = basePath.getCanonicalPath() + File.separator + logFile;

    // log to file and console:
    // PlainLogger logger = new PlainLogger( logFile );
    // log only to console:
    PlainLogger logger = new PlainLogger();

    logger.log("Work directory: " + basePath.getCanonicalPath());
    logger.log("Lucene index: " + luceneIndexName);
    logger.log("Wikipedia dumpfile: " + wikiDumpFile);
    logger.log("");
    if (bIgnoreStubs)
        logger.log("Ignoring stubs");
    else
        logger.log("Including stubs");
    logger.log("");

    // create the index
    Directory indexDirectory = FSDirectory.open(FileSystems.getDefault().getPath(baseDir));
    IndexWriter indexWriter = new IndexWriter(indexDirectory, new IndexWriterConfig(analyzer));

    Extractor wikidumpExtractor = new Extractor(basePath.getCanonicalPath() + File.separator + wikiDumpFile);
    wikidumpExtractor.setLinkSeparator("_");
    wikidumpExtractor.setCategorySeparator("_");

    int iStubs = 0;
    int iArticleCount = 0;
    int iSkippedPageCount = 0;
    long iStartTime = java.lang.System.nanoTime();
    long iTime = iStartTime;

    FieldType fieldType = new FieldType();
    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    fieldType.setTokenized(true);
    fieldType.setStoreTermVectors(true);
    fieldType.setStoreTermVectorPositions(true);

    while (wikidumpExtractor.nextPage()) {
        if (wikidumpExtractor.getPageType() != Extractor.PageType.ARTICLE) {
            ++iSkippedPageCount;
            continue;
        }

        if (bIgnoreStubs && wikidumpExtractor.getStub()) {
            ++iStubs;
            continue;
        }

        Document doc = new Document();
        ++iArticleCount;

        doc.add(new StoredField("path", String.format("%d", iArticleCount)));

        wikidumpExtractor.setTitleSeparator("_");
        String title = wikidumpExtractor.getPageTitle(false).toLowerCase();
        doc.add(new Field("title", title, fieldType));

        wikidumpExtractor.setTitleSeparator(" ");
        doc.add(new Field("tokenized_title", wikidumpExtractor.getPageTitle(false).toLowerCase(), fieldType));

        doc.add(new Field("categories", wikidumpExtractor.getPageCategories().toLowerCase(), fieldType));
        doc.add(new Field("links", wikidumpExtractor.getPageLinks().toLowerCase(), fieldType));
        doc.add(new Field("contents", wikidumpExtractor.getPageAbstract().toLowerCase(), fieldType));

        indexWriter.addDocument(doc);

        if (!writeToTextFilesDir.isEmpty()) {
            String fileName = doc.get("title");
            fileName = fileName.replace('/', '_');
            writeToTextFile(writeToTextFilesDir, fileName, doc.get("contents"));
        }

        if (iArticleCount % 50000 == 0) {
            logger.add(iArticleCount + " (" + NanoTimeFormatter.getS(System.nanoTime() - iTime) + "s) ");
            iTime = System.nanoTime();

            if (iArticleCount % 250000 == 0) {
                try {
                    indexWriter.commit();
                    logger.add("-- commit. Skipped page count " + iSkippedPageCount + " (+ " + iStubs + " stubs)");
                    logger.log(String.format(", time %sm", NanoTimeFormatter.getM(System.nanoTime() - iStartTime)));
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }

    logger.log("");
    logger.log(String.format("Overall time %s minutes, ", NanoTimeFormatter.getM(System.nanoTime() - iStartTime)));
    logger.add("collected " + iArticleCount + " articles, ");
    logger.add("skipped " + iSkippedPageCount + " nonarticle pages,");
    logger.log("skipped " + iStubs + " stubs.");
    logger.log("");

    iTime = System.nanoTime();
    logger.add(" closing...");
    indexWriter.close();
    logger.log(" done in " + NanoTimeFormatter.getS(System.nanoTime() - iTime) + "s.");
    logger.close();

    System.exit(0);
}
From source file:net.chwise.indexing.DocumentFromWikitextExtractor.java
License:Open Source License
public Document getLuceneDocument(SimpleFieldToFieldProcessor simpleFieldToFieldProcessor, String pathStr,
        String title, String smiles, String wikitext) throws LinkTargetException, EngineException {
    // Retrieve a page
    PageTitle pageTitle = PageTitle.make(config, pathStr);
    PageId pageId = new PageId(pageTitle, -1);

    // Compile the retrieved page
    EngProcessedPage cp = engine.postprocess(pageId, wikitext, null);

    // Check chembox in the beginning
    TextConverter markupStripConverter = new TextConverter(config, wrapCol);
    String text = (String) markupStripConverter.go(cp.getPage());

    InfoBoxDataExtractor infoBoxDataExtractor = new InfoBoxDataExtractor();
    Map<String, String> infoboxFields = (Map<String, String>) infoBoxDataExtractor.go(cp.getPage());

    // Update statistics if required
    if (calculateStatistics) {
        for (String key : infoboxFields.keySet()) {
            int count = infoboxKeysCout.containsKey(key) ? infoboxKeysCout.get(key) : 0;
            infoboxKeysCout.put(key, count + 1);
        }
    }

    // Create Lucene document
    Document document = new Document();
    document.add(new TextField(DocDefinitions.TITLE_FIELD_NAME, title, Field.Store.YES));
    document.add(new TextField(DocDefinitions.TEXT_FIELD_NAME, text, Field.Store.YES));
    document.add(new TextField(DocDefinitions.STRUCTURE_SMILES_FIELD_NAME, smiles, Field.Store.YES));
    document.add(new TextField(DocDefinitions.URL_FIELD_NAME, "#", Field.Store.YES));
    document.add(new StoredField(DocDefinitions.STRUCTURE_MOL_FIELD_NAME,
            toMOLConverter.MOLChargesKludge(toMOLConverter.convert(smiles))));

    simpleFieldToFieldProcessor.process(infoboxFields, document);
    return document;
}
From source file:net.semanticmetadata.lire.builders.AbstractLocalDocumentBuilder.java
License:Open Source License
/**
 * Creates the Lucene Fields with the vector representation of a list of local features.
 *
 * @param listOfLocalFeatures is the list of local features.
 * @param extractorItem       is the extractor that was used to extract the features.
 * @param listOfCodebooks     is the list which can contain one or more codebooks to be used for the aggregation of the local features.
 * @return Lucene Fields with the vector representation of the list of local features.
 */
public Field[] createLocalDescriptorFields(List<? extends LocalFeature> listOfLocalFeatures,
        ExtractorItem extractorItem, LinkedList<Cluster[]> listOfCodebooks) {
    Field[] result = new Field[listOfCodebooks.size() * 2];
    int count = 0;
    for (Cluster[] codebook : listOfCodebooks) {
        aggregator.createVectorRepresentation(listOfLocalFeatures, codebook);
        result[count] = new StoredField(fieldNamesDictionary.get(extractorItem).get(codebook.length)[0],
                aggregator.getByteVectorRepresentation());
        result[count + 1] = new TextField(fieldNamesDictionary.get(extractorItem).get(codebook.length)[1],
                aggregator.getStringVectorRepresentation(), Field.Store.YES);
        count += 2;
    }
    return result;
}
From source file:net.semanticmetadata.lire.builders.GlobalDocumentBuilder.java
License:Open Source License
/**
 * Extracts the global feature and returns the Lucene Fields for the selected image.
 *
 * @param image         is the selected image.
 * @param extractorItem is the extractor to be used to extract the features.
 * @return Lucene Fields.
 */
private Field[] getGlobalDescriptorFields(BufferedImage image, ExtractorItem extractorItem) {
    Field[] result;
    // if (hashingEnabled) result = new Field[2];
    // else result = new Field[1];
    Field hash = null;
    Field vector = null;
    GlobalFeature globalFeature = extractGlobalFeature(image, (GlobalFeature) extractorItem.getExtractorInstance());

    if (!useDocValues) {
        // TODO: Stored field is compressed and upon search decompression takes a lot of time (> 50% with a small index with 50k images). Find something else ...
        vector = new StoredField(extractorItems.get(extractorItem)[0],
                new BytesRef(globalFeature.getByteArrayRepresentation()));
    } else {
        // Alternative: The DocValues field. It's extremely fast to read, but it's all in RAM most likely.
        vector = new BinaryDocValuesField(extractorItems.get(extractorItem)[0],
                new BytesRef(globalFeature.getByteArrayRepresentation()));
    }

    // if BitSampling is an issue we add a field with the given hashFunctionsFileName and the suffix "hash":
    if (hashingEnabled) {
        // TODO: check eventually if there is a more compressed string version of the integers, i.e. the hex string
        if (globalFeature.getFeatureVector().length <= 3100) {
            int[] hashes;
            if (hashingMode == HashingMode.BitSampling) {
                hashes = BitSampling.generateHashes(globalFeature.getFeatureVector());
                hash = new TextField(extractorItems.get(extractorItem)[1],
                        SerializationUtils.arrayToString(hashes), Field.Store.YES);
            } else if (hashingMode == HashingMode.LSH) {
                hashes = LocalitySensitiveHashing.generateHashes(globalFeature.getFeatureVector());
                hash = new TextField(extractorItems.get(extractorItem)[1],
                        SerializationUtils.arrayToString(hashes), Field.Store.YES);
            } else if (hashingMode == HashingMode.MetricSpaces) {
                if (MetricSpaces.supportsFeature(globalFeature)) {
                    // the name of the field is set at "addExtractor" time.
                    hash = new TextField(extractorItems.get(extractorItem)[1],
                            MetricSpaces.generateHashString(globalFeature), Field.Store.YES);
                }
            }
        } else
            System.err.println("Could not create hashes, feature vector too long: "
                    + globalFeature.getFeatureVector().length + " (" + globalFeature.getClass().getName() + ")");
    }

    if (hash != null)
        result = new Field[] { vector, hash };
    else
        result = new Field[] { vector };
    return result;
}
From source file:net.semanticmetadata.lire.GeneralTest.java
License:Open Source License
public void testCreateAndSearchSmallIndex() throws IOException {
    for (int i = 0, buildersLength = builders.length; i < buildersLength; i++) {
        DocumentBuilder b = builders[i];
        // create an index with a specific builder:
        IndexWriter iw = LuceneUtils.createIndexWriter(indexPath + "-small", true);
        for (String identifier : testFiles) {
            Document doc = b.createDocument(new FileInputStream(testFilesPath + identifier), identifier);
            doc.add(new StoredField("video_file", "surgery1.mp4"));
            doc.add(new StoredField("timestamp", "25"));
            iw.addDocument(doc);
        }
        iw.close();

        ImageSearcher s = searchers[i];
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath + "-small")));
        for (int k = 0; k < reader.maxDoc(); k++) {
            Document query = reader.document(k);
            ImageSearchHits hits = s.search(query, reader);
            for (int y = 0; y < hits.length(); y++) {
                Document result = hits.doc(y);
                if (y == 0) { // check if the first result is the query:
                    assertEquals(result.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]
                            .equals(query.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), true);
                    System.out.println(result.getValues("video_file")[0]);
                } else { // check if they are ordered by distance:
                    assertEquals(hits.score(y) < hits.score(y - 1), true);
                }
            }
        }
    }
}
From source file:net.semanticmetadata.lire.imageanalysis.bovw.BOVWBuilder.java
License:Open Source License
private void createVisualWords(Document d, LireFeature f) {
    double[] tmpHist = new double[numClusters];
    Arrays.fill(tmpHist, 0d);
    IndexableField[] fields = d.getFields(localFeatureFieldName);
    // remove the fields if they are already there ...
    d.removeField(visualWordsFieldName);
    d.removeField(localFeatureHistFieldName);

    // find the appropriate cluster for each feature:
    for (int j = 0; j < fields.length; j++) {
        f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset,
                fields[j].binaryValue().length);
        tmpHist[clusterForFeature((Histogram) f)]++;
    }
    // quantize(tmpHist);
    d.add(new TextField(visualWordsFieldName, arrayToVisualWordString(tmpHist), Field.Store.YES));
    d.add(new StoredField(localFeatureHistFieldName, SerializationUtils.toByteArray(tmpHist)));

    // remove local features to save some space if requested:
    if (DELETE_LOCAL_FEATURES) {
        d.removeFields(localFeatureFieldName);
    }

    // for debugging ..
    // System.out.println(d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0] + " " + Arrays.toString(tmpHist));
}
From source file:net.semanticmetadata.lire.imageanalysis.bovw.LocalFeatureHistogramBuilderFromCodeBook.java
License:Open Source License
private void createVisualWords(Document d, LireFeature f) {
    double[] tmpHist = new double[numClusters];
    IndexableField[] fields = d.getFields(localFeatureFieldName);
    // remove the fields if they are already there ...
    d.removeField(visualWordsFieldName);
    d.removeField(localFeatureHistFieldName);

    // find the appropriate cluster for each feature:
    for (int j = 0; j < fields.length; j++) {
        f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset,
                fields[j].binaryValue().length);
        tmpHist[clusterForFeature((Histogram) f, clusters)]++;
    }
    d.add(new StoredField(localFeatureHistFieldName, SerializationUtils.toByteArray(tmpHist)));
    // quantize(tmpHist);
    d.add(new TextField(visualWordsFieldName, arrayToVisualWordString(tmpHist), Field.Store.YES));

    // remove local features to save some space if requested:
    if (DELETE_LOCAL_FEATURES) {
        d.removeFields(localFeatureFieldName);
    }

    // for debugging ..
    // System.out.println(d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0] + " " + Arrays.toString(tmpHist));
}
From source file:net.semanticmetadata.lire.imageanalysis.bovw.LocalFeatureHistogramBuilderKmeansPlusPlus.java
License:Open Source License
/**
 * Uses an existing index, where each and every document should have a set of local features. A number of
 * random images (numDocsForVocabulary) is selected and clustered to get a vocabulary of visual words
 * (the cluster means). For all images a histogram on the visual words is created and added to the documents.
 * Pre-existing histograms are deleted, so this method can be used for re-indexing.
 *
 * @throws java.io.IOException
 */
public void index() throws IOException {
    df.setMaximumFractionDigits(3);
    // find the documents for building the vocabulary:
    HashSet<Integer> docIDs = selectVocabularyDocs();
    System.out.println("Using " + docIDs.size() + " documents to build the vocabulary.");
    KMeansPlusPlusClusterer kpp = new KMeansPlusPlusClusterer(numClusters, 15);
    // fill the KMeans object:
    LinkedList<DoublePoint> features = new LinkedList<DoublePoint>();
    // Needed for check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    for (Iterator<Integer> iterator = docIDs.iterator(); iterator.hasNext();) {
        int nextDoc = iterator.next();
        if (reader.hasDeletions() && !liveDocs.get(nextDoc))
            continue; // if it is deleted, just ignore it.
        Document d = reader.document(nextDoc);
        // features.clear();
        IndexableField[] fields = d.getFields(localFeatureFieldName);
        String file = d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0];
        for (int j = 0; j < fields.length; j++) {
            LireFeature f = getFeatureInstance();
            f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset,
                    fields[j].binaryValue().length);
            // copy the data over to new array ...
            double[] feat = new double[f.getDoubleHistogram().length];
            System.arraycopy(f.getDoubleHistogram(), 0, feat, 0, feat.length);
            features.add(new DoublePoint(f.getDoubleHistogram()));
        }
    }
    if (features.size() < numClusters) {
        // this cannot work. You need more data points than clusters.
        throw new UnsupportedOperationException("Only " + features.size() + " features found to cluster in "
                + numClusters + ". Try to use less clusters or more images.");
    }
    // do the clustering:
    System.out.println("Number of local features: " + df.format(features.size()));
    System.out.println("Starting clustering ...");
    List<CentroidCluster<DoublePoint>> clusterList = kpp.cluster(features);
    // TODO: Serializing clusters to a file on the disk ...
    System.out.println("Clustering finished, " + clusterList.size() + " clusters found");
    clusters = new LinkedList<double[]>();
    for (Iterator<CentroidCluster<DoublePoint>> iterator = clusterList.iterator(); iterator.hasNext();) {
        CentroidCluster<DoublePoint> centroidCluster = iterator.next();
        clusters.add(centroidCluster.getCenter().getPoint());
    }
    System.out.println("Creating histograms ...");
    int[] tmpHist = new int[numClusters];
    IndexWriter iw = LuceneUtils.createIndexWriter(((DirectoryReader) reader).directory(), true,
            LuceneUtils.AnalyzerType.WhitespaceAnalyzer, 256d);
    // careful: copy reader to RAM for faster access when reading ...
    // reader = IndexReader.open(new RAMDirectory(reader.directory()), true);
    LireFeature f = getFeatureInstance();
    for (int i = 0; i < reader.maxDoc(); i++) {
        try {
            if (reader.hasDeletions() && !liveDocs.get(i))
                continue;
            for (int j = 0; j < tmpHist.length; j++) {
                tmpHist[j] = 0;
            }
            Document d = reader.document(i);
            IndexableField[] fields = d.getFields(localFeatureFieldName);
            // remove the fields if they are already there ...
            d.removeField(visualWordsFieldName);
            d.removeField(localFeatureHistFieldName);
            // find the appropriate cluster for each feature:
            for (int j = 0; j < fields.length; j++) {
                f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset,
                        fields[j].binaryValue().length);
                tmpHist[clusterForFeature(f, clusters)]++;
            }
            // System.out.println(Arrays.toString(tmpHist));
            d.add(new StoredField(localFeatureHistFieldName, SerializationUtils.toByteArray(normalize(tmpHist))));
            quantize(tmpHist);
            d.add(new TextField(visualWordsFieldName, arrayToVisualWordString(tmpHist), Field.Store.YES));
            // remove local features to save some space if requested:
            if (DELETE_LOCAL_FEATURES) {
                d.removeFields(localFeatureFieldName);
            }
            // now write the new one. we use the identifier to update ;)
            iw.updateDocument(new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER,
                    d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), d);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    iw.commit();
    // this one does the "old" commit(), it removes the deleted local features.
    iw.forceMerge(1);
    iw.close();
    System.out.println("Finished.");
}
From source file:net.semanticmetadata.lire.imageanalysis.bovw.VLADBuilder.java
License:Open Source License
private void createVisualWords(Document d, LireFeature f) {
    IndexableField[] fields = d.getFields(localFeatureFieldName);
    // remove the fields if they are already there ...
    d.removeField(vladFieldName);
    d.removeField(vladHistFieldName);

    double[] vlad = new double[clusters.length * (clusters[0].getMean()).length];
    Arrays.fill(vlad, 0d);
    int clusterIndex;
    double[] mean;
    // VLAD - Vector of Locally Aggregated Descriptors
    for (int j = 0; j < fields.length; j++) {
        f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset,
                fields[j].binaryValue().length);
        clusterIndex = clusterForFeature((Histogram) f);
        // System.out.println("clusterIndex = " + clusterIndex);
        mean = clusters[clusterIndex].getMean();
        for (int k = 0; k < f.getDoubleHistogram().length; k++) {
            // System.out.println((clusterIndex*f.getDoubleHistogram().length+k) + " - mean: " + mean.length + " - feature: " + f.getDoubleHistogram().length);
            vlad[clusterIndex * f.getDoubleHistogram().length + k] += f.getDoubleHistogram()[k] - mean[k];
        }
    }
    normalize(vlad);
    GenericDoubleLireFeature feat = new GenericDoubleLireFeature();
    feat.setData(vlad);
    // System.out.println(feat.getStringRepresentation());
    d.add(new TextField(vladFieldName, feat.getStringRepresentation(), Field.Store.YES));
    d.add(new StoredField(vladHistFieldName, feat.getByteArrayRepresentation()));

    // remove local features to save some space if requested:
    if (DELETE_LOCAL_FEATURES) {
        d.removeFields(localFeatureFieldName);
    }

    // for debugging ..
    // System.out.println(d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0] + " " + Arrays.toString(vlad));
}
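All of the examples above write StoredField values at indexing time. For completeness, a minimal sketch of reading such values back from the index; the index path and the field names "path" and "hist" are illustrative placeholders, not taken from the examples above.

import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

// Hypothetical helper: retrieve stored-only field values from a document.
static void printStoredFields(String indexDir) throws IOException {
    try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexDir)))) {
        Document doc = reader.document(0);          // stored fields of the first document
        String path = doc.get("path");              // StoredField added with a String value
        BytesRef hist = doc.getBinaryValue("hist"); // StoredField added with byte[]/BytesRef
        System.out.println(path + " / " + (hist != null ? hist.length + " bytes" : "no binary value"));
    }
}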