List of usage examples for org.apache.lucene.document StoredField StoredField
public StoredField(String name, double value) — one of several overloads; the examples below also use the String-, byte[]-, float-, int- and long-valued constructors.
From source file:org.xwiki.contrib.repository.pypi.internal.searching.PypiPackageListIndexUpdateTask.java
License:Open Source License
/**
 * Builds the Lucene document describing one PyPI package release.
 *
 * @param packageName the package's name, indexed both tokenized and verbatim
 * @param version     the release version, stored for retrieval only
 * @return the populated document, ready to be added to the index
 */
private Document createNewDocument(String packageName, String version) {
    final Document doc = new Document();
    // Tokenized so partial-name searches match.
    doc.add(new TextField(LuceneParameters.PACKAGE_NAME, packageName, Field.Store.YES));
    // Exact-match identifier: StringField is indexed as a single untokenized term.
    doc.add(new StringField(LuceneParameters.ID, packageName, Field.Store.YES));
    // Stored only — retrievable from hits but not searchable.
    doc.add(new StoredField(LuceneParameters.VERSION, version));
    return doc;
}
From source file:part1.IndexBusiness.java
/** * @param args//from w ww .j ava 2 s .co m * the command line arguments */ public static void main(String[] args) throws FileNotFoundException, IOException { // TODO code application logic here Analyzer analyzer = new StandardAnalyzer(); Directory dir = FSDirectory.open(Paths.get("index_business")); IndexWriterConfig iwc = new IndexWriterConfig(analyzer); iwc.setOpenMode(OpenMode.CREATE); IndexWriter writer = new IndexWriter(dir, iwc); Gson gson = new Gson(); File f = new File("/Users/arpitkhandelwal/Downloads/yelp_dataset_challenge_academic_dataset/business.json"); BufferedReader reader = new BufferedReader(new FileReader(f)); String line = null; int count = 0; try { while ((line = reader.readLine()) != null) { JsonObject r = gson.fromJson(line, JsonObject.class); // System.out.println(r.get("attributes").toString()); JsonObject a = gson.fromJson(r.get("attributes").toString(), JsonObject.class); // a.keySet(); Set alist = a.entrySet(); Business r1 = gson.fromJson(line, Business.class); Document luceneDoc = new Document(); luceneDoc.add(new TextField("fulladdress", r1.full_address, Field.Store.YES)); luceneDoc.add(new StringField("businessid", r1.business_id, Field.Store.YES)); luceneDoc.add(new StringField("city", r1.city, Field.Store.YES)); luceneDoc.add(new StringField("open", String.valueOf(r1.open), Field.Store.YES)); luceneDoc.add(new StringField("name", r1.name, Field.Store.YES)); luceneDoc.add(new StoredField("longitude", r1.longitude)); luceneDoc.add(new StoredField("latitude", r1.latitude)); luceneDoc.add(new StoredField("reviewcount", r1.review_count)); luceneDoc.add(new StringField("state", r1.state, Field.Store.YES)); luceneDoc.add(new StoredField("stars", r1.stars)); luceneDoc.add(new StringField("type", r1.type, Field.Store.YES)); for (String c : r1.categories) luceneDoc.add(new SortedSetDocValuesField("category", new BytesRef(c))); for (String c : r1.neighborhoods) luceneDoc.add(new SortedSetDocValuesField("neighborhoods", new BytesRef(c))); for 
(Object attribute : alist) { String attributedetail[] = attribute.toString().split("="); String key = attributedetail[0].trim(); String value = attributedetail[1].trim(); luceneDoc.add(new TextField(key, value, Field.Store.YES)); } writer.addDocument(luceneDoc); } } catch (Exception e) { System.out.println(line); e.printStackTrace(); } finally { reader.close(); writer.close(); } }
From source file:perf.PrintPerFieldHeapUsage.java
License:Apache License
/**
 * Micro-benchmark: measures the per-unique-field heap cost of an open
 * IndexReader for four field types (StoredField, indexed StringField,
 * NumericDocValuesField, SortedDocValuesField). For each type it indexes
 * FIELD_COUNT documents, each carrying one field with a unique name
 * ("f0", "f1", ...), force-merges to a single segment, reopens the index,
 * and reports RamUsageTester.sizeOf(reader) divided by FIELD_COUNT.
 * For the doc-values types it reports both the "latent" cost (before any
 * values are touched) and the "loaded" cost (after forcing every field's
 * values to be loaded).
 */
public static void main(String[] args) throws IOException {
    Directory dir = FSDirectory.open(Paths.get("fields"));
    int fieldUpto;
    IndexWriterConfig iwc;
    IndexWriter w;
    long t0;
    IndexReader r;

    // Stored field:
    iwc = new IndexWriterConfig(new WhitespaceAnalyzer());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    w = new IndexWriter(dir, iwc);
    fieldUpto = 0;
    t0 = System.nanoTime();
    for (int i = 0; i < FIELD_COUNT; i++) {
        // One unique field name per document so the reader must track FIELD_COUNT fields.
        Document doc = new Document();
        doc.add(new StoredField("f" + fieldUpto, "text" + i));
        fieldUpto++;
        w.addDocument(doc);
    }
    // Single segment so the measurement isn't skewed by per-segment overhead.
    w.forceMerge(1);
    w.close();
    r = DirectoryReader.open(dir);
    System.out.println(String.format(Locale.ROOT, "Took %.1f sec; bytes per unique StoredField: %.1f",
            (System.nanoTime() - t0) / 1000000000.0, (RamUsageTester.sizeOf(r) / (double) FIELD_COUNT)));
    r.close();

    // Indexed StringField:
    iwc = new IndexWriterConfig(new WhitespaceAnalyzer());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    w = new IndexWriter(dir, iwc);
    fieldUpto = 0;
    t0 = System.nanoTime();
    for (int i = 0; i < FIELD_COUNT; i++) {
        Document doc = new Document();
        doc.add(new StringField("f" + fieldUpto, "text" + i, Field.Store.NO));
        fieldUpto++;
        w.addDocument(doc);
    }
    w.forceMerge(1);
    w.close();
    r = DirectoryReader.open(dir);
    System.out.println(String.format(Locale.ROOT, "Took %.1f sec; bytes per unique StringField: %.1f",
            (System.nanoTime() - t0) / 1000000000.0, (RamUsageTester.sizeOf(r) / (double) FIELD_COUNT)));
    r.close();

    // Numeric DV field:
    iwc = new IndexWriterConfig(new WhitespaceAnalyzer());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    w = new IndexWriter(dir, iwc);
    fieldUpto = 0;
    t0 = System.nanoTime();
    for (int i = 0; i < FIELD_COUNT; i++) {
        Document doc = new Document();
        doc.add(new NumericDocValuesField("f" + fieldUpto, i));
        fieldUpto++;
        w.addDocument(doc);
    }
    w.forceMerge(1);
    w.close();
    r = DirectoryReader.open(dir);
    // "latent": measured before any doc values are loaded.
    System.out.println(String.format(Locale.ROOT,
            "Took %.1f sec; bytes per unique NumericDocValuesField, latent: %.1f",
            (System.nanoTime() - t0) / 1000000000.0, (RamUsageTester.sizeOf(r) / (double) FIELD_COUNT)));
    // Now force lazy loading of all the DV fields:
    for (int i = 0; i < FIELD_COUNT; i++) {
        MultiDocValues.getNumericValues(r, "f" + i);
    }
    System.out.println(String.format(Locale.ROOT, "Bytes per unique NumericDocValuesField, loaded: %.1f",
            (RamUsageTester.sizeOf(r) / (double) FIELD_COUNT)));
    r.close();

    // Sorted DV field:
    iwc = new IndexWriterConfig(new WhitespaceAnalyzer());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    w = new IndexWriter(dir, iwc);
    fieldUpto = 0;
    t0 = System.nanoTime();
    for (int i = 0; i < FIELD_COUNT; i++) {
        Document doc = new Document();
        doc.add(new SortedDocValuesField("f" + fieldUpto, new BytesRef("text" + i)));
        fieldUpto++;
        w.addDocument(doc);
    }
    w.forceMerge(1);
    w.close();
    r = DirectoryReader.open(dir);
    System.out.println(String.format(Locale.ROOT,
            "Took %.1f sec; bytes per unique SortedDocValuesField, latent: %.1f",
            (System.nanoTime() - t0) / 1000000000.0, (RamUsageTester.sizeOf(r) / (double) FIELD_COUNT)));
    // Now force lazy loading of all the DV fields:
    for (int i = 0; i < FIELD_COUNT; i++) {
        MultiDocValues.getSortedValues(r, "f" + i);
    }
    System.out.println(String.format(Locale.ROOT, "Bytes per unique SortedDocValuesField, loaded: %.1f",
            (RamUsageTester.sizeOf(r) / (double) FIELD_COUNT)));
    r.close();
    dir.close();
}
From source file:perLucene.IndexerThread.java
License:Open Source License
public boolean addDoc(String language, String summary, String text, long date, String wkt, byte[] gdid, long id) { Analyzer analyzer = ha.get(language); Document doc = new Document(); doc.add(new TextField("summary", summary, Field.Store.NO)); doc.add(new TextField("text", text, Field.Store.NO)); doc.add(new LongDocValuesField("uid", id)); doc.add(new DerefBytesDocValuesField("language", new BytesRef(language))); doc.add(new LongField("date", date, Field.Store.NO)); doc.add(new StoredField("gdid", gdid)); doc.add(new LongField("id", id, Field.Store.NO)); // ?? for/*w w w. jav a 2s . co m*/ // deletions we // need // LongField to // do search on // id if (sp.addFields(doc, wkt)) { try { w.addDocument(doc, analyzer); } catch (Exception e) { System.out.println("Couldnt add Doc"); System.out.println("Stacktrace " + e.toString()); try { if (w != null) { w.close(); } } catch (IOException e1) { } System.exit(-1); } return true; } else { return false; } }
From source file:slib.sml.sm.core.measures.corpus.VocContextMatrixBuilder.java
License:Open Source License
public static void main(String[] args) throws SLIB_Ex_Critic, IOException { String[] ext = { "txt" }; List<File> files = FileUtils.listFilesForFolder("/data/tmp/wiki/", Arrays.asList(ext), 100000); List<String> docField = new ArrayList<String>(); docField.add("content"); String[] vocArray = { "lion", "panthera", "Africa", "lamb", "insecticides", "animal", "Genealogists", "rugby", "football", "Sydney", "Australia" }; Set<String> voc = new HashSet<String>(Arrays.asList(vocArray)); MatrixType matrixType = MatrixType.WORD_WORD; VocContextMatrixBuilder matrixBuilder = new VocContextMatrixBuilder(matrixType, voc, docField); for (File f : files) { Document doc = new Document(); String fileAsString = FileUtils.readFile(f.getAbsolutePath(), Charset.defaultCharset()); doc.add(new StoredField("content", fileAsString)); matrixBuilder.process(doc);/*from www. j a v a 2 s . co m*/ } // Document docA = new Document(); // docA.add(new StoredField("title", "Lion article")); // docA.add(new StoredField("content", "The lion (Panthera leo) is one of the four big cats in the genus Panthera and a member of the family Felidae. With some males exceeding 250 kg (550 lb) in weight,[4] it is the second-largest living cat after the tiger. Wild lions currently exist in sub-Saharan Africa and in Asia (where an endangered remnant population resides in Gir Forest National Park in India) while other types of lions have disappeared from North Africa and Southwest Asia in historic times. Until the late Pleistocene, about 10,000 years ago, the lion was the most widespread large land mammal after humans")); // // Document docB = new Document(); // docB.add(new StoredField("title", "Java (programming language)")); // docB.add(new StoredField("content", "Java is a computer programming language that is concurrent, class-based, object-oriented, and specifically designed to have as few implementation dependencies as possible. 
It is intended to let application developers \"write once, run anywhere\" (WORA), meaning that code that runs on one platform does not need to be recompiled to run on another. Java applications are typically compiled to bytecode (class file) that can run on any Java virtual machine (JVM) regardless of computer architecture. Java is, as of 2014, one of the most popular programming languages in use, particularly for client-server web applications, with a reported 9 million developers.[10][11] Java was originally developed by James Gosling at Sun Microsystems (which has since merged into Oracle Corporation) and released in 1995 as a core component of Sun Microsystems' Java platform. The language derives much of its syntax from C and C++, but it has fewer low-level facilities than either of them.")); // // Document docC = new Document(); // docC.add(new StoredField("title", "Java (programming language)")); // docC.add(new StoredField("content", "Java is a computer programming language that is concurrent, class-based, object-oriented, and specifically designed to have as few implementation dependencies as possible. It is intended to let application developers \"write once, run anywhere\" (WORA), meaning that code that runs on one platform does not need to be recompiled to run on another. Java applications are typically compiled to bytecode (class file) that can run on any Java virtual machine (JVM) regardless of computer architecture. Java is, as of 2014, one of the most popular programming languages in use, particularly for client-server web applications, with a reported 9 million developers.[10][11] Java was originally developed by James Gosling at Sun Microsystems (which has since merged into Oracle Corporation) and released in 1995 as a core component of Sun Microsystems' Java platform. 
The language derives much of its syntax from C and C++, but it has fewer low-level facilities than either of them.")); // // docs.add(docA); // docs.add(docB); // docs.add(docC); // matrixBuilder.buildMatrix(docs); Matrix<String, String> mat = matrixBuilder.getMatrix(); System.out.println("size: " + mat.getInternalStorage().keySet().size()); for (String s : mat.getInternalStorage().keySet()) { System.out.println( s + "\t(" + mat.getInternalStorage().get(s).size() + ")\t" + mat.getInternalStorage().get(s)); } }
From source file:spatialluceneindexer.files.LuceneWriter.java
public void addPark(Park park) { Document doc = new Document(); doc.add(new TextField("name", park.getname(), Field.Store.YES)); //First we make the shape, then we make the indexed field from it. This field can not be stored //This assumes there is always only one shape per document while there could be multiple Shape pointShape = spatialContext.makePoint(park.getPos().get(0).doubleValue(), park.getPos().get(1).doubleValue()); for (IndexableField f : spatialStrategy.createIndexableFields(pointShape)) { doc.add(f);/*from w ww .j ava 2s. c o m*/ } //now let's store the field as well - could be useful to return this to the client doc.add(new StoredField("coords", spatialContext.toString(pointShape))); try { indexWriter.addDocument(doc); } catch (IOException ex) { System.out.println( "Threw an exception trying to add the doc: " + ex.getClass() + " :: " + ex.getMessage()); } System.out.println(park.getname()); }
From source file:start.lucene.CPeptidesIndexer.java
/** * This method prepares a Document from a given line for indexing * * @param line//from w w w .jav a 2 s . co m * @return * @throws IOException */ public Document getDocument(StringBuilder line) throws IOException { Document doc = new Document(); int maxDoc = getIndexWriter().maxDoc(), id = maxDoc++; // Fill document like "name-value" pair. doc.add(new IntField(FieldName.ID, id, Field.Store.YES)); // proteinA name String[] sp = line.toString().split("\t"); // Except mass, all is StringField but not TextField because // A text field is a sequence of terms that has been tokenized and punctuation and spacing are ignored-good for keyword search // while a string field is a single term with literal character strings with all punctuation, and cannot tokenized (only for atomic values), spacing,and case preserved // StringField is always indexed since Lucene4.0 // StoredField is for storing but not indexing at all (and so, is not searchable). // StoredField(String name, String value)creates a stored-only field with the given string value doc.add(new StoredField(FieldName.PROTEINA, sp[0])); doc.add(new StoredField(FieldName.PROTEINB, sp[1])); // proteinB name doc.add(new StoredField(FieldName.PEPTIDEA, sp[2])); // peptideA sequence doc.add(new StoredField(FieldName.PEPTIDEB, sp[3])); // peptideB sequence doc.add(new StoredField(FieldName.LINKA, sp[4])); // proteinA name doc.add(new StoredField(FieldName.LINKB, sp[5])); // proteinB name doc.add(new StoredField(FieldName.FIXMODA, sp[6])); // linkerPeptideA doc.add(new StoredField(FieldName.FIXMODB, sp[7])); // linkerPeptideB doc.add(new StoredField(FieldName.VARMODA, sp[8])); // ModificationsPeptideA doc.add(new StoredField(FieldName.VARMODB, sp[9])); // ModificationsPeptideB //doc.add(new StringField("mass", sp[10], Field.Store.YES)); // Mass doc.add(new DoubleField(FieldName.MASS, Double.parseDouble(sp[10]), indexedFieldType)); if (sp.length > 11) { doc.add(new StoredField(FieldName.TYPE, sp[11])); //Type doc.add(new 
StoredField(FieldName.LABEL, sp[12])); // Labeling-true:Heavylabeled } return doc; }
From source file:uk.ac.open.kmi.squire.index.RDFDatasetIndexer.java
/**
 * Indexes the signature of an RDF dataset as a single Lucene document,
 * replacing any previously indexed document(s) for the same URL/graph pair.
 *
 * @param urlAddress  the dataset endpoint URL (exact-match key field)
 * @param graphName   optional graph name; part of the key when non-empty
 * @param indexand    the dataset whose class/property/literal sets are stored
 * @param propertySet optional extra property set to store; skipped when
 *                    null or empty
 * @param overwrite   when false and the pair is already indexed, nothing is
 *                    written and null is returned
 * @return the document that was added, or null when skipped
 * @throws RuntimeException wrapping any IOException from the index writer
 */
public Document indexSignature(String urlAddress, String graphName, IRDFDataset indexand,
        Collection<String> propertySet, boolean overwrite) {
    // Guard: respect an existing entry unless the caller asked to overwrite.
    if (alreadyIndexed(urlAddress, graphName) && !overwrite) {
        log.warn("Already indexed: {}{}", urlAddress, graphName == null ? "" : "::" + graphName);
        log.warn(" ... overwrite not set, so not indexing.");
        return null;
    }
    Analyzer analyzer = new StandardAnalyzer(); // = new WhitespaceAnalyzer();
    IndexWriter indexWriter;
    /*
     * IndexWriterConfig.OpenMode.CREATE_OR_APPEND if used IndexWriter will create a
     * new index if there is not already an index at the provided path and otherwise
     * open the existing index.
     */
    IndexWriterConfig config = new IndexWriterConfig(analyzer);// .setOpenMode(OpenMode.CREATE_OR_APPEND);
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    try {
        indexWriter = new IndexWriter(getIndex(), config);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    // For every dataset a document
    Document doc = new Document();
    // XXX AA I think the values are so because it is assumed that Set#toString()
    // prints [ one, two, ... ] but can it be trusted?
    // Key fields are indexed untokenized so TermQuery deletion (below) matches exactly.
    doc.add(new Field("URL", urlAddress, Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(new Field("GraphName", graphName, Field.Store.YES, Field.Index.NOT_ANALYZED));
    // doc.add(new Field("ClassSet", indexand.getClassSet().toString(),
    // Field.Store.YES, Field.Index.NO));
    // Signature payloads: stored for retrieval only, never searched.
    doc.add(new Field("ObjectPropertySet", indexand.getObjectPropertySet().toString(), Field.Store.YES,
            Field.Index.NO));
    doc.add(new Field("DatatypePropertySet", indexand.getDatatypePropertySet().toString(), Field.Store.YES,
            Field.Index.NO));
    doc.add(new Field("LiteralSet", indexand.getLiteralSet().toString(), Field.Store.YES, Field.Index.NO));
    doc.add(new Field("IndividualSet", indexand.getIndividualSet().toString(), Field.Store.YES, Field.Index.NO));
    doc.add(new Field("RDFVocabulary", indexand.getRDFVocabulary().toString(), Field.Store.YES, Field.Index.NO));
    if (propertySet != null && !propertySet.isEmpty())
        doc.add(new Field("PropertySet", propertySet.toString(), Field.Store.YES, Field.Index.NO));
    // TODO the new way of using Lucene, apply to rest
    doc.add(new StoredField(Fieldd.ClassSet.toString(), indexand.getClassSet().toString()));
    // Serialize the per-class signatures to a JSON object, stored as a UTF-8 string.
    JsonObject jSign = new JsonObject();
    for (Entry<String, ClassSignature> entry : indexand.getClassSignatures().entrySet())
        jSign.put(entry.getKey(), entry.getValue().jsonifyPaths());
    try {
        ByteArrayOutputStream os = new ByteArrayOutputStream();
        JSON.write(os, jSign);
        doc.add(new StoredField(Fieldd.CLASS_SIGNATURES.toString(), new String(os.toByteArray(), "UTF-8")));
    } catch (UnsupportedEncodingException e1) {
        // Close the writer before propagating; UTF-8 is required by the JVM spec.
        shutdown(indexWriter);
        throw new RuntimeException("UTF-8 not supported. Seriously?", e1);
    }
    // Remove the old one(s) if any: match on URL, and on GraphName too when given.
    Builder queryBuilder = new Builder();
    queryBuilder.add(new TermQuery(new Term("URL", urlAddress)), BooleanClause.Occur.MUST);
    if (graphName != null && !graphName.isEmpty())
        queryBuilder.add(new TermQuery(new Term("GraphName", graphName)), BooleanClause.Occur.MUST);
    try {
        indexWriter.deleteDocuments(queryBuilder.build());
        indexWriter.addDocument(doc);
    } catch (IOException e) {
        throw new RuntimeException(e);
    } finally {
        // The writer is always released, on success or failure.
        shutdown(indexWriter);
    }
    return doc;
}
From source file:webdocs.ClueWebDoc.java
@Override Document constructLuceneDoc() {/*from w ww. ja va2s . c om*/ Document doc = new Document(); doc.add(new Field(FIELD_ID, this.docNo, Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field(WTDOC_FIELD_URL, this.url, Field.Store.YES, Field.Index.NOT_ANALYZED)); // store the title and the raw html doc.add(new Field(WTDOC_FIELD_TITLE, this.title == null ? "" : this.title, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO)); String ppHTML = html; try { ppHTML = preProcessHTML(html); } catch (Exception ex) { ex.printStackTrace(); } doc.add(new StoredField(WTDOC_FIELD_HTML, compress(ppHTML))); // the words only... no term vectors doc.add(new Field(FIELD_ANALYZED_CONTENT, this.text, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.NO)); return doc; }
From source file:webdocs.WebDocAnalyzer.java
Document constructLuceneDoc() { Document doc = new Document(); doc.add(new Field(FIELD_ID, this.docNo, Field.Store.YES, Field.Index.NOT_ANALYZED)); // store the title and the raw html doc.add(new Field(WTDOC_FIELD_TITLE, this.title == null ? "" : this.title, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO)); doc.add(new StoredField(WTDOC_FIELD_HTML, compress(html))); // the words (also store the term vector) doc.add(new Field(FIELD_ANALYZED_CONTENT, this.text, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.NO)); return doc;//from w w w . ja v a2s . com }