List of usage examples for the org.apache.lucene.document.StoredField constructor
public StoredField(String name, double value)
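Note: a StoredField is kept verbatim with the document and returned with search hits, but it is not indexed, so it cannot be searched or sorted on by itself. A minimal sketch of this constructor in isolation (the "price" field name and value are illustrative, not taken from the sources below):

import org.apache.lucene.document.Document;
import org.apache.lucene.document.StoredField;

public class StoredFieldExample {
    public static Document priceDocument() {
        Document doc = new Document();
        // Stored with the document and retrievable at search time,
        // but not indexed: this field cannot be queried on its own.
        doc.add(new StoredField("price", 9.99d));
        return doc;
    }
}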
From source file:de.ids_mannheim.korap.index.FieldDocument.java
public void addStored(String key, String value) {
    doc.add(new StoredField(key, value));
}
From source file:de.ids_mannheim.korap.index.FieldDocument.java
public void addStored(String key, int value) {
    doc.add(new StoredField(key, value));
}
From source file:de.qaware.chronix.lucene.client.add.LuceneAddingService.java
License:Apache License
/**
 * Tries to cast the field value (object) to a String or byte[].
 * If the field value is neither a String nor a byte[], the method ignores the field.
 * <p>
 * If the value is a String or byte[], it is wrapped in a matching Lucene field
 * (Field for String, StoredField for byte[]) and added to the Lucene document.
 *
 * @param document   the Lucene document to add the field to
 * @param fieldName  the field name
 * @param fieldValue the field value
 */
private static void handleStringsAndBytes(Document document, String fieldName, Object fieldValue) {
    if (fieldValue instanceof String) {
        document.add(new Field(fieldName, fieldValue.toString(), TextField.TYPE_STORED));
    } else if (fieldValue instanceof byte[]) {
        document.add(new StoredField(fieldName, new BytesRef((byte[]) fieldValue)));
    }
}
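Note: a byte[] stored this way comes back as a BytesRef when the document is retrieved. A minimal retrieval sketch, assuming a hypothetical "payload" field name (not part of the source above):

import java.util.Arrays;
import org.apache.lucene.document.Document;
import org.apache.lucene.util.BytesRef;

public class StoredBytesExample {
    // Returns the stored bytes, or null if the field is absent.
    // The field name "payload" is an assumption for illustration.
    public static byte[] readStoredBytes(Document hit) {
        BytesRef ref = hit.getBinaryValue("payload");
        return ref == null ? null
                : Arrays.copyOfRange(ref.bytes, ref.offset, ref.offset + ref.length);
    }
}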
From source file:de.qaware.chronix.lucene.client.add.LuceneAddingService.java
License:Apache License
/**
 * Tries to cast the field value (object) to a number (double, integer, float, long).
 * If the field value is not a number, the method ignores the field.
 * <p>
 * If the value is a number, it is wrapped in a Lucene StoredField of the
 * matching type and added to the Lucene document.
 *
 * @param document   the Lucene document to add the number to
 * @param fieldName  the field name
 * @param fieldValue the field value
 */
private static void handleNumbers(Document document, String fieldName, Object fieldValue) {
    if (fieldValue instanceof Double) {
        document.add(new StoredField(fieldName, Double.parseDouble(fieldValue.toString())));
    } else if (fieldValue instanceof Integer) {
        document.add(new StoredField(fieldName, Integer.parseInt(fieldValue.toString())));
    } else if (fieldValue instanceof Float) {
        document.add(new StoredField(fieldName, Float.parseFloat(fieldValue.toString())));
    } else if (fieldValue instanceof Long) {
        document.add(new StoredField(fieldName, Long.parseLong(fieldValue.toString())));
    } else {
        LOGGER.warn("Could not extract value from field {} with value {}", fieldName, fieldValue);
    }
}
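Note: because these numeric values go into plain StoredFields rather than indexed numeric fields, they are retrievable with search hits but not searchable or sortable. A minimal sketch of reading such a number back (the method name is illustrative):

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexableField;

public class StoredNumberExample {
    // Returns the stored number, or null if the field is absent.
    public static Number readStoredNumber(Document hit, String fieldName) {
        IndexableField f = hit.getField(fieldName);
        return f == null ? null : f.numericValue();
    }
}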
From source file:de.tudarmstadt.lt.lm.app.GenerateNgramIndex.java
License:Apache License
public void create_ngram_index(File ngram_joined_counts_file) throws IOException {
    File index_dir = new File(_index_dir, "ngram");
    if (index_dir.exists()) {
        LOG.info("Ngram index already exists in directory '{}'.", index_dir.getAbsolutePath());
        if (_overwrite) {
            LOG.info("Overwriting index '{}',", index_dir);
            index_dir.delete();
        } else
            return;
    }
    index_dir.mkdirs();

    Analyzer analyzer = new KeywordAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
    iwc.setOpenMode(OpenMode.CREATE);
    // use 80 percent of the available total memory
    double total_mem_mb = (double) Runtime.getRuntime().maxMemory() / 1e6;
    double percentage_ram_buffer = Properties.ramBufferPercentage();
    if (percentage_ram_buffer > 0) {
        double percentage_ram_buffer_mb = total_mem_mb * percentage_ram_buffer;
        LOG.info(String.format("Setting ram buffer size to %.2f MB (%.2f%% from %.2f MB)",
                percentage_ram_buffer_mb, percentage_ram_buffer * 100, total_mem_mb));
        iwc.setRAMBufferSizeMB(percentage_ram_buffer_mb);
    }
    Directory directory = new MMapDirectory(index_dir);
    IndexWriter writer_ngram = new IndexWriter(directory, iwc);

    InputStream in = new FileInputStream(ngram_joined_counts_file);
    if (ngram_joined_counts_file.getName().endsWith(".gz"))
        in = new GZIPInputStream(in);
    LineIterator iter = new LineIterator(new BufferedReader(new InputStreamReader(in, "UTF-8")));

    Document doc = new Document();
    Field f_ngram = new StringField("ngram", "", Store.YES);
    doc.add(f_ngram);
    Field f_n = new IntField("cardinality", 0, Store.YES);
    doc.add(f_n);
    Field f_word = new StringField("word", "", Store.YES);
    doc.add(f_word);
    Field f_hist = new StringField("history", "", Store.YES);
    doc.add(f_hist);
    Field f_lower = new StringField("lower", "", Store.YES);
    doc.add(f_lower);
    Field f_count = new StoredField("num", 0L);
    doc.add(f_count);

    Field[] f_follow = new Field[4];
    f_follow[0] = new StoredField("nf_s", 0L);
    doc.add(f_follow[0]);
    f_follow[1] = new StoredField("nf_N1", 0L);
    doc.add(f_follow[1]);
    f_follow[2] = new StoredField("nf_N2", 0L);
    doc.add(f_follow[2]);
    f_follow[3] = new StoredField("nf_N3", 0L);
    doc.add(f_follow[3]);
    Field[] f_precede = new Field[4];
    f_precede[0] = new StoredField("np_s", 0L);
    doc.add(f_precede[0]);
    f_precede[1] = new StoredField("np_N1", 0L);
    doc.add(f_precede[1]);
    f_precede[2] = new StoredField("np_N2", 0L);
    doc.add(f_precede[2]);
    f_precede[3] = new StoredField("np_N3", 0L);
    doc.add(f_precede[3]);
    Field[] f_followerprecede = new Field[4];
    f_followerprecede[0] = new StoredField("nfp_s", 0L);
    doc.add(f_followerprecede[0]);
    f_followerprecede[1] = new StoredField("nfp_N1", 0L);
    doc.add(f_followerprecede[1]);
    f_followerprecede[2] = new StoredField("nfp_N2", 0L);
    doc.add(f_followerprecede[2]);
    f_followerprecede[3] = new StoredField("nfp_N3", 0L);
    doc.add(f_followerprecede[3]);

    Long[][] N = new Long[][] { { 0L, 0L, 0L, 0L, 0L, 0L } };
    Long[] S = new Long[] { 0L };
    long c = 0;
    while (iter.hasNext()) {
        if (++c % 100000 == 0)
            LOG.info("Adding {}'th ngram.", c);
        String line = iter.next();
        try {
            String[] splits = de.tudarmstadt.lt.utilities.StringUtils.rtrim(line).split("\t");
            String ngram_str = splits[0];
            if (de.tudarmstadt.lt.utilities.StringUtils.trim(ngram_str).isEmpty()) {
                LOG.warn("Ngram is empty, skipping line {}: '{}' (file '{}').", c, line,
                        ngram_joined_counts_file);
                continue;
            }
            List<String> ngram = Arrays.asList(ngram_str.split(" "));
            long num = Long.parseLong(splits[1]);
            int n = ngram.size();

            f_ngram.setStringValue(ngram_str);
            f_n.setIntValue(n);
            f_word.setStringValue(ngram.get(ngram.size() - 1));
            f_hist.setStringValue(StringUtils.join(ngram.subList(0, ngram.size() - 1), " "));
            f_lower.setStringValue(StringUtils.join(ngram.subList(1, ngram.size()), " "));
            f_count.setLongValue(num);

            for (int j = 0; j < f_follow.length; j++) {
                f_follow[j].setLongValue(0L);
                f_precede[j].setLongValue(0L);
                f_followerprecede[j].setLongValue(0L);
            }

            if (splits.length > 2 && !splits[2].isEmpty()) { // precede or follow or followerprecede
                String[] splits_ = splits[2].split(":");
                String type = splits_[0];
                String[] count_values = splits_[1].split(",");
                if (count_values.length > 0) {
                    if ("n_f".equals(type))
                        f_follow[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_p".equals(type))
                        f_precede[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_fp".equals(type))
                        f_followerprecede[0].setLongValue(Long.parseLong(count_values[0]));
                }
                for (int i = 1; i < count_values.length; i++) {
                    if ("n_f".equals(type))
                        f_follow[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_p".equals(type))
                        f_precede[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_fp".equals(type))
                        f_followerprecede[i].setLongValue(Long.parseLong(count_values[i]));
                }
            }
            if (splits.length > 3 && !splits[3].isEmpty()) { // should be follow or followerprecede
                String[] splits_ = splits[3].split(":");
                String type = splits_[0];
                String[] count_values = splits_[1].split(",");
                if (count_values.length > 0) {
                    if ("n_f".equals(type))
                        f_follow[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_p".equals(type))
                        f_precede[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_fp".equals(type))
                        f_followerprecede[0].setLongValue(Long.parseLong(count_values[0]));
                }
                for (int i = 1; i < count_values.length; i++) {
                    if ("n_f".equals(type))
                        f_follow[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_p".equals(type))
                        f_precede[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_fp".equals(type))
                        f_followerprecede[i].setLongValue(Long.parseLong(count_values[i]));
                }
            }
            if (splits.length > 4 && !splits[4].isEmpty()) { // should be followerprecede
                String[] splits_ = splits[4].split(":");
                String type = splits_[0];
                String[] count_values = splits_[1].split(",");
                if (count_values.length > 0) {
                    if ("n_f".equals(type))
                        f_follow[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_p".equals(type))
                        f_precede[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_fp".equals(type))
                        f_followerprecede[0].setLongValue(Long.parseLong(count_values[0]));
                }
                for (int i = 1; i < count_values.length; i++) {
                    if ("n_f".equals(type))
                        f_follow[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_p".equals(type))
                        f_precede[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_fp".equals(type))
                        f_followerprecede[i].setLongValue(Long.parseLong(count_values[i]));
                }
            }

            writer_ngram.addDocument(doc);

            while (N.length <= n) {
                N = ArrayUtils.getConcatinatedArray(N, new Long[][] { { 0L, 0L, 0L, 0L, 0L, 0L } });
                S = ArrayUtils.getConcatinatedArray(S, new Long[] { 0L });
            }
            if (num == 1L)
                N[n][1]++;
            else if (num == 2L)
                N[n][2]++;
            else if (num == 3L)
                N[n][3]++;
            else if (num == 4L)
                N[n][4]++;
            else
                N[n][5]++;
            N[n][0]++;
            S[n] += num;
        } catch (Exception e) {
            LOG.error("Could not process line '{}' in file '{}:{}', malformed line.", line,
                    ngram_joined_counts_file, c, e);
        }
    }
    writer_ngram.forceMergeDeletes();
    writer_ngram.commit();
    writer_ngram.close();

    StringBuilder b = new StringBuilder(String.format(
            "#%n# Number of times where an ngram occurred: %n# at_least_once, exactly_once, exactly_twice, exactly_three_times, exactly_four_times, five_times_or_more.%n#%nmax_n=%d%nmax_c=6%n",
            N.length - 1));
    for (int n = 1; n < N.length; n++)
        b.append(String.format("n%d=%s%n", n, StringUtils.join(N[n], ',')));
    for (int n = 1; n < S.length; n++)
        b.append(String.format("s%d=%d%n", n, S[n]));
    FileUtils.writeStringToFile(new File(_index_dir, "__sum_ngrams__"), b.toString());
}
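Note: the method above reuses a single Document and mutates its field instances via setStringValue/setLongValue before each addDocument call, a common Lucene idiom that avoids re-allocating fields for every input line. A minimal sketch of the same idiom, assuming a simple word-to-count map (all names are illustrative):

import java.io.IOException;
import java.util.Map;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexWriter;

public class ReusedDocumentExample {
    public static void indexCounts(IndexWriter writer, Map<String, Long> counts) throws IOException {
        Document doc = new Document();
        StringField word = new StringField("word", "", Field.Store.YES);
        StoredField num = new StoredField("num", 0L);
        doc.add(word);
        doc.add(num);
        for (Map.Entry<String, Long> e : counts.entrySet()) {
            // mutate the shared field instances, then re-add the same document
            word.setStringValue(e.getKey());
            num.setLongValue(e.getValue());
            writer.addDocument(doc);
        }
    }
}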
From source file:di.uniba.it.tee2.index.TemporalEventIndexing.java
License:Open Source License
/**
 * Creates and stores an XML document from the given input string after
 * tagging it with HeidelTime.
 *
 * @param title
 * @param content
 * @param fileName
 * @param docID
 * @param wikiID
 * @param revisionID
 * @throws java.lang.Exception
 */
public void add(String title, String content, String fileName, String docID, int wikiID, int revisionID)
        throws Exception {
    TaggedText tt = null;
    try {
        tt = tempExtractor.process(content);
    } catch (Exception ex) {
        logger.log(Level.WARNING, "Error processing doc " + docID + " (skip doc)", ex);
    }
    if (tt != null) {
        // stores id and text (not tagged) in docrep_index (document repository)
        Document docrep_doc = new Document();
        docrep_doc.add(new StringField("id", docID, Field.Store.YES));
        docrep_doc.add(new IntField("wikiID", wikiID, Field.Store.YES));
        docrep_doc.add(new IntField("revisionID", revisionID, Field.Store.YES));
        docrep_doc.add(new StringField("title", title, Field.Store.YES));
        docrep_doc.add(new StoredField("content", tt.getText()));
        docrep_doc.add(new StringField("filename", fileName, Field.Store.YES));
        docrep_writer.addDocument(docrep_doc);
        // stores id and text (not tagged) in doc_index for search
        Document doc_doc = new Document();
        doc_doc.add(new StringField("id", docID, Field.Store.YES));
        doc_doc.add(new IntField("wikiID", wikiID, Field.Store.YES));
        doc_doc.add(new IntField("revisionID", revisionID, Field.Store.YES));
        doc_doc.add(new TextField("title", title, Field.Store.NO));
        doc_doc.add(new TextField("content", tt.getText(), Field.Store.NO));
        doc_writer.addDocument(doc_doc);
        logger.log(Level.FINE, "Found {0} temporal events", tt.getEvents().size());
        for (TimeEvent event : tt.getEvents()) {
            // for each TIMEX3, store id, date string, offsets and context in time_index
            Document time_doc = new Document();
            time_doc.add(new StringField("id", docID, Field.Store.YES));
            //time_doc.add(new StringField("file", fileName, Field.Store.YES));
            //time_doc.add(new TextField("content", tt.getTaggedText(), Field.Store.NO));
            /*FieldType ft = new FieldType();
            ft.setStoreTermVectors(true);
            ft.setTokenized(true);
            ft.setStored(true);
            ft.setIndexed(true);
            ft.setStoreTermVectorPositions(true);
            ft.setOmitNorms(false);*/
            time_doc.add(new StringField("time", event.getDateString(), Field.Store.YES));
            time_doc.add(new IntField("offset_start", event.getStartOffset(), Field.Store.YES));
            time_doc.add(new IntField("offset_end", event.getEndOffset(), Field.Store.YES));
            time_doc.add(new TextField("context",
                    getTimeContext(tt.getText(), event.getStartOffset(), event.getEndOffset()),
                    Field.Store.NO));
            time_writer.addDocument(time_doc);
        }
    }
}
From source file:di.uniba.it.tee2.index.TemporalEventIndexingTS.java
License:Open Source License
/**
 * Creates and stores an XML document from the given input string after
 * tagging it with HeidelTime.
 *
 * @param title
 * @param content
 * @param fileName
 * @param docID
 * @param wikiID
 * @param revisionID
 */
public void add(String title, String content, String fileName, String docID, int wikiID, int revisionID)
        throws Exception {
    TaggedText tt = null;
    try {
        TemporalExtractor tempExtractor = new TemporalExtractor(lang);
        tempExtractor.init();
        tt = tempExtractor.process(content);
    } catch (Exception ex) {
        logger.log(Level.WARNING, "Error processing doc " + docID + " (skip doc)", ex);
    }
    if (tt != null) {
        // stores id and text (not tagged) in docrep_index (document repository)
        Document docrep_doc = new Document();
        docrep_doc.add(new StringField("id", docID, Field.Store.YES));
        docrep_doc.add(new IntField("wikiID", wikiID, Field.Store.YES));
        docrep_doc.add(new IntField("revisionID", revisionID, Field.Store.YES));
        docrep_doc.add(new StringField("title", title, Field.Store.YES));
        docrep_doc.add(new StoredField("content", tt.getText()));
        docrep_doc.add(new StringField("filename", fileName, Field.Store.YES));
        docrep_writer.addDocument(docrep_doc);
        // stores id and text (not tagged) in doc_index for search
        Document doc_doc = new Document();
        doc_doc.add(new StringField("id", docID, Field.Store.YES));
        doc_doc.add(new IntField("wikiID", wikiID, Field.Store.YES));
        doc_doc.add(new IntField("revisionID", revisionID, Field.Store.YES));
        doc_doc.add(new TextField("title", title, Field.Store.NO));
        doc_doc.add(new TextField("content", tt.getText(), Field.Store.NO));
        doc_writer.addDocument(doc_doc);
        logger.log(Level.FINE, "Found {0} temporal events", tt.getEvents().size());
        for (TimeEvent event : tt.getEvents()) {
            // for each TIMEX3, store id, date string, offsets and context in time_index
            Document time_doc = new Document();
            time_doc.add(new StringField("id", docID, Field.Store.YES));
            //time_doc.add(new StringField("file", fileName, Field.Store.YES));
            //time_doc.add(new TextField("content", tt.getTaggedText(), Field.Store.NO));
            /*FieldType ft = new FieldType();
            ft.setStoreTermVectors(true);
            ft.setTokenized(true);
            ft.setStored(true);
            ft.setIndexed(true);
            ft.setStoreTermVectorPositions(true);
            ft.setOmitNorms(false);*/
            time_doc.add(new StringField("time", event.getDateString(), Field.Store.YES));
            time_doc.add(new IntField("offset_start", event.getStartOffset(), Field.Store.YES));
            time_doc.add(new IntField("offset_end", event.getEndOffset(), Field.Store.YES));
            time_doc.add(new TextField("context",
                    getTimeContext(tt.getText(), event.getStartOffset(), event.getEndOffset()),
                    Field.Store.NO));
            time_writer.addDocument(time_doc);
        }
    }
}
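Note: in both temporal indexers the untagged text is written twice: as a StoredField in the repository index (retrievable with the document but not searchable) and as an unstored TextField in the search index (searchable but not retrievable). A minimal sketch of fetching the stored text back from the repository index, with illustrative names:

import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.IndexSearcher;

public class RepositoryTextExample {
    // Returns the stored (untagged) text for a repository-index hit.
    public static String repositoryText(IndexSearcher docrepSearcher, int luceneDocId) throws IOException {
        Document d = docrepSearcher.doc(luceneDocId);
        return d.get("content"); // added via new StoredField("content", ...)
    }
}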
From source file:dk.dma.msinm.lucene.SpatialLuceneTest.java
License:Open Source License
private Document newSampleDocument(int id, Shape... shapes) {
    Document doc = new Document();
    doc.add(new IntField("id", id, Field.Store.YES));
    for (Shape shape : shapes) {
        for (IndexableField f : strategy.createIndexableFields(shape)) {
            doc.add(f);
        }
        doc.add(new StoredField(strategy.getFieldName(), shape.toString()));
    }
    return doc;
}
From source file:dk.dma.msinm.service.MessageSearchService.java
License:Open Source License
/**
 * Adds a shape to the document
 *
 * @param doc   the Lucene document
 * @param shape the shape to add
 * @return the updated document
 */
private Document addShapeSearchFields(Document doc, Shape shape) {
    for (IndexableField f : strategy.createIndexableFields(shape)) {
        doc.add(f);
    }
    doc.add(new StoredField(strategy.getFieldName(), shape.toString()));
    return doc;
}
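Note: both spatial examples follow the same pattern: SpatialStrategy.createIndexableFields produces index-only fields, so the shape's string form is stored separately under the strategy's field name for later retrieval. A minimal sketch of reading it back (the method name is illustrative):

import org.apache.lucene.document.Document;
import org.apache.lucene.spatial.SpatialStrategy;

public class StoredShapeExample {
    // Returns the shape's stored string form, or null if absent.
    public static String storedShape(Document hit, SpatialStrategy strategy) {
        return hit.get(strategy.getFieldName());
    }
}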
From source file:edu.cmu.lti.oaqa.baseqa.concept.rerank.LuceneInMemoryConceptReranker.java
License:Apache License
private static Document toLuceneDocument(ConceptSearchResult result) {
    Document entry = new Document();
    entry.add(new StoredField("uri", result.getUri()));
    String names = String.join(", ", TypeUtil.getConceptNames(result.getConcept()));
    entry.add(new TextField("text", names, Field.Store.NO));
    return entry;
}
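Note: this last example pairs a searchable-but-unstored TextField with a stored-but-unsearched StoredField, so a match on the concept names returns the URI. A minimal retrieval sketch, assuming a searcher over this index and some query against the "text" field:

import java.io.IOException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;

public class UriLookupExample {
    // Returns the stored "uri" of the best-matching concept, or null if no hit.
    public static String topUri(IndexSearcher searcher, Query query) throws IOException {
        TopDocs top = searcher.search(query, 1);
        if (top.scoreDocs.length == 0)
            return null;
        return searcher.doc(top.scoreDocs[0].doc).get("uri");
    }
}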