List of usage examples for org.apache.lucene.document.StoredField (constructor)
public StoredField(String name, double value)
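Before the collected real-world examples, a minimal self-contained sketch of this constructor; the field name and the in-memory directory are illustrative, assuming a Lucene 7.x classpath:

import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class StoredFieldDoubleExample {
    public static void main(String[] args) throws Exception {
        Directory dir = new RAMDirectory();
        try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new WhitespaceAnalyzer()))) {
            Document doc = new Document();
            // StoredField is stored-only: it travels with the document
            // but is neither indexed nor searchable on its own.
            doc.add(new StoredField("price", 19.99));
            writer.addDocument(doc);
        }
        try (DirectoryReader reader = DirectoryReader.open(dir)) {
            // Stored numeric values come back as a generic Number.
            double price = reader.document(0).getField("price").numericValue().doubleValue();
            System.out.println(price); // 19.99
        }
    }
}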
From source file: index.IndexOmimtsv.java
License: Apache License
/** Indexes a single document */
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    try (InputStream stream = Files.newInputStream(file)) {
        // make a new, empty document
        Document doc = new Document();

        // Add the path of the file as a field named "path". Use a
        // field that is indexed (i.e. searchable), but don't tokenize
        // the field into separate words and don't index term frequency
        // or positional information:
        Field pathField = new StringField("path", file.toString(), Field.Store.YES);
        doc.add(pathField);

        // Add the last modified date of the file as a field named "modified".
        // Use a LongPoint that is indexed (i.e. efficiently filterable with
        // PointRangeQuery). This indexes to millisecond resolution, which
        // is often too fine. You could instead create a number based on
        // year/month/day/hour/minutes/seconds, down to the resolution you require.
        // For example the long value 2011021714 would mean
        // February 17, 2011, 2-3 PM.
        doc.add(new LongPoint("modified", lastModified));

        // Add the contents of the file to a field named "contents". Specify a Reader,
        // so that the text of the file is tokenized and indexed, but not stored.
        // Note that FileReader expects the file to be in UTF-8 encoding.
        // If that's not the case searching for special characters will fail.
        InputStreamReader ipsr = new InputStreamReader(stream);
        BufferedReader br = new BufferedReader(ipsr);
        String line = br.readLine(); // skip the TSV header line
        int cpt = 0;
        while ((line = br.readLine()) != null) {
            String[] tokens = line.trim().split("\t");
            if (tokens.length > 6) {
                String id = tokens[0].split("/")[tokens[0].split("/").length - 1].trim();
                if (id.matches("^[0-9]*")) {
                    doc = new Document();
                    cpt++;
                    doc.add(new TextField("ID", id, Field.Store.NO));
                    if (!tokens[5].trim().matches("^C[0-9].*")) {
                        for (String token : tokens) {
                            if (token.trim().matches("^C[0-9].*")) {
                                doc.add(new StoredField("CUI", token.trim()));
                                break;
                            }
                        }
                        if (doc.getFields().size() != 2) {
                            doc.add(new StoredField("CUI", ""));
                        }
                    } else {
                        doc.add(new StoredField("CUI", tokens[5].trim()));
                    }
                    doc.add(new StoredField("Label", tokens[1].trim()));
                    writer.addDocument(doc);
                }
            }
        }
        System.out.println("Number of elements: " + cpt);

        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // New index, so we just add the document (no old document can be there):
            System.out.println("adding " + file);
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been indexed) so
            // we use updateDocument instead to replace the old one matching the exact
            // path, if present:
            System.out.println("updating " + file);
            writer.updateDocument(new Term("path", file.toString()), doc);
        }
    }
}
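A hedged sketch of reading those stored fields back at search time; the index path, searcher setup, and queried ID are illustrative, not part of the original source:

IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("omim-index")));
IndexSearcher searcher = new IndexSearcher(reader);
TopDocs hits = searcher.search(new TermQuery(new Term("ID", "104300")), 1);
if (hits.scoreDocs.length > 0) {
    // "CUI" and "Label" were added as StoredFields, so they come back with the document.
    Document hit = searcher.doc(hits.scoreDocs[0].doc);
    System.out.println(hit.get("CUI") + " / " + hit.get("Label"));
}
reader.close();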
From source file: indexer.DocVector.java
public Document constructDoc() {
    Document doc = new Document();
    doc.add(new Field(FIELD_ID, docName == null ? "" : docName,
            Field.Store.YES, Field.Index.NOT_ANALYZED));
    // Store the vector as a byte array (binary format rather than text format,
    // which takes more space).
    doc.add(new StoredField(FIELD_VEC, this.getVecBytes(x)));
    // doc.add(new Field(FIELD_CELL_ID, quantized, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
    doc.add(new Field(FIELD_CELL_ID, quantized, Field.Store.NO,
            Field.Index.ANALYZED, Field.TermVector.NO));
    return doc;
}
From source file: indexer.DocVector.java
public Document constructDoc(DocVector wholeDocvec) {
    Document doc = new Document();
    doc.add(new Field(FIELD_SUBVEC_ID, docName == null ? "" : docName,
            Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(new Field(FIELD_ID, wholeDocvec.docName == null ? "" : wholeDocvec.docName,
            Field.Store.YES, Field.Index.NOT_ANALYZED));
    // Store the vectors as byte arrays (binary format rather than text format,
    // which takes more space).
    doc.add(new StoredField(FIELD_PARENT_VEC, wholeDocvec.getVecBytes(x)));
    doc.add(new StoredField(FIELD_VEC, this.getVecBytes(x)));
    // doc.add(new Field(FIELD_CELL_ID, quantized, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
    doc.add(new Field(FIELD_CELL_ID, quantized, Field.Store.NO,
            Field.Index.ANALYZED, Field.TermVector.NO));
    return doc;
}
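Document.getBinaryValue returns the stored bytes as a BytesRef. A sketch of recovering a float vector from it, assuming getVecBytes (not shown above) packs the components as consecutive big-endian floats:

BytesRef stored = doc.getBinaryValue(FIELD_VEC);
FloatBuffer floats = ByteBuffer.wrap(stored.bytes, stored.offset, stored.length).asFloatBuffer();
float[] vec = new float[floats.remaining()];
floats.get(vec);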
From source file: io.anserini.embeddings.IndexW2V.java
License: Apache License
public void indexEmbeddings() throws IOException, InterruptedException {
    LOG.info("Starting indexer...");
    long startTime = System.currentTimeMillis();

    final WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
    final IndexWriterConfig config = new IndexWriterConfig(analyzer);
    final IndexWriter writer = new IndexWriter(directory, config);

    BufferedReader bRdr = new BufferedReader(new FileReader(args.input));
    String line = null;
    bRdr.readLine(); // skip the header line
    Document document = new Document();
    ByteArrayOutputStream byteStream = new ByteArrayOutputStream();
    int cnt = 0;

    while ((line = bRdr.readLine()) != null) {
        String[] termEmbedding = line.trim().split("\t");
        document.add(new StringField(LuceneDocumentGenerator.FIELD_ID, termEmbedding[0], Field.Store.NO));

        // Pack the embedding components into a byte array, one big-endian float each.
        String[] parts = termEmbedding[1].split(" ");
        for (int i = 0; i < parts.length; ++i) {
            byteStream.write(ByteBuffer.allocate(4).putFloat(Float.parseFloat(parts[i])).array());
        }
        document.add(new StoredField(FIELD_BODY, byteStream.toByteArray()));

        byteStream.flush();
        byteStream.reset();
        writer.addDocument(document);
        document.clear();

        cnt++;
        if (cnt % 100000 == 0) {
            LOG.info(cnt + " terms indexed");
        }
    }
    LOG.info(String.format("Total of %s terms added", cnt));

    try {
        writer.commit();
        writer.forceMerge(1);
    } finally {
        try {
            writer.close();
        } catch (IOException e) {
            LOG.error(e);
        }
    }
    LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms");
}
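Each float was written through ByteBuffer.allocate(4).putFloat(...), i.e. big-endian, so the stored embedding can be decoded the same way. A sketch, with the reader and document id assumed:

Document d = reader.document(docId);
BytesRef raw = d.getBinaryValue(FIELD_BODY);
ByteBuffer buffer = ByteBuffer.wrap(raw.bytes, raw.offset, raw.length);
float[] embedding = new float[raw.length / Float.BYTES];
for (int i = 0; i < embedding.length; i++) {
    embedding[i] = buffer.getFloat();
}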
From source file: io.anserini.embeddings.search.IndexW2V.java
License: Apache License
public void indexEmbeddings() throws IOException, InterruptedException {
    LOG.info("Starting indexer...");

    final Directory dir = FSDirectory.open(indexPath);
    final WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
    final IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    config.setUseCompoundFile(false);
    config.setMergeScheduler(new ConcurrentMergeScheduler());

    final IndexWriter writer = new IndexWriter(dir, config);
    Document document = new Document();

    BufferedReader bRdr = new BufferedReader(new FileReader(args.input));
    String line = null;
    bRdr.readLine(); // skip the header line
    while ((line = bRdr.readLine()) != null) {
        String[] termEmbedding = line.trim().split("\t");
        document.add(new StringField(LuceneDocumentGenerator.FIELD_ID, termEmbedding[0], Field.Store.YES));
        // Here the embedding is kept as its raw text representation.
        document.add(new StoredField(LuceneDocumentGenerator.FIELD_BODY, termEmbedding[1]));
        // Write the document and reuse the instance for the next term;
        // without these two calls the fields would only accumulate in memory.
        writer.addDocument(document);
        document.clear();
    }
    writer.close();
}
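Since this variant stores the embedding as its raw text line, decoding is plain string parsing. A sketch, with the stored document assumed to have been retrieved already:

String[] parts = storedDoc.get(LuceneDocumentGenerator.FIELD_BODY).split(" ");
float[] vector = new float[parts.length];
for (int i = 0; i < parts.length; i++) {
    vector[i] = Float.parseFloat(parts[i]);
}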
From source file: io.anserini.index.generator.LuceneDocumentGenerator.java
License: Apache License
public Document createDocument(SourceDocument src) {
    String id = src.id();
    String contents;

    try {
        // If there's a transform, use it.
        contents = transform != null ? transform.apply(src.content()) : src.content();
    } catch (Exception e) {
        LOG.error("Error extracting document text, skipping document: " + id, e);
        counters.errors.incrementAndGet();
        return null;
    }

    if (contents.trim().length() == 0) {
        LOG.info("Empty document: " + id);
        counters.emptyDocuments.incrementAndGet();
        return null;
    }

    // make a new, empty document
    Document document = new Document();

    // document id
    document.add(new StringField(FIELD_ID, id, Field.Store.YES));

    if (args.storeRawDocs) {
        document.add(new StoredField(FIELD_RAW, src.content()));
    }

    FieldType fieldType = new FieldType();
    fieldType.setStored(args.storeTransformedDocs);

    // Are we storing document vectors?
    if (args.storeDocvectors) {
        fieldType.setStoreTermVectors(true);
        fieldType.setStoreTermVectorPositions(true);
    }

    // Are we building a "positional" or "count" index?
    if (args.storePositions) {
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    } else {
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    }
    document.add(new Field(FIELD_BODY, contents, fieldType));

    return document;
}
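At retrieval time the raw and transformed text come back through the stored-fields API. A minimal sketch, assuming a searcher over this index and reusing the field-name constants above:

Document stored = searcher.doc(scoreDoc.doc);
String raw = stored.get(FIELD_RAW);   // non-null only if storeRawDocs was set
String body = stored.get(FIELD_BODY); // non-null only if storeTransformedDocs was set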
From source file: io.anserini.index.generator.TweetGenerator.java
License: Apache License
@Override
public Document createDocument(TweetCollection.Document tweetDoc) {
    String id = tweetDoc.id();

    if (tweetDoc.content().trim().isEmpty()) {
        counters.empty.incrementAndGet();
        return null;
    }

    final TwitterTextParseResults result = TwitterTextParser.parseTweet(tweetDoc.content().trim());
    if (!result.isValid) {
        counters.errors.incrementAndGet();
        return null;
    }
    String text = tweetDoc.content().trim().substring(result.validTextRange.start, result.validTextRange.end);

    if (!args.tweetKeepUrls) {
        final Extractor extractor = new Extractor();
        final List<String> urls = extractor.extractURLs(text);
        for (String url : urls) {
            text = text.replaceAll(url, "");
        }
    }
    text = text.trim();
    if (text.isEmpty()) {
        counters.empty.incrementAndGet();
        return null;
    }

    // Skip deleted tweet ids.
    if (deletes != null && deletes.contains(id)) {
        counters.skipped.incrementAndGet();
        return null;
    }
    if (tweetDoc.getIdLong() > args.tweetMaxId) {
        counters.skipped.incrementAndGet();
        return null;
    }
    if (!args.tweetKeepRetweets && tweetDoc.getRetweetedStatusId().isPresent()) {
        counters.skipped.incrementAndGet();
        return null;
    }

    Document doc = new Document();
    doc.add(new StringField(FIELD_ID, id, Field.Store.YES));
    // We need this to break scoring ties.
    doc.add(new LongPoint(StatusField.ID_LONG.name, tweetDoc.getIdLong()));
    doc.add(new NumericDocValuesField(StatusField.ID_LONG.name, tweetDoc.getIdLong()));

    tweetDoc.getEpoch().ifPresent(epoch -> doc.add(new LongPoint(StatusField.EPOCH.name, epoch)));
    doc.add(new StringField(StatusField.SCREEN_NAME.name, tweetDoc.getScreenName(), Field.Store.NO));
    doc.add(new IntPoint(StatusField.FRIENDS_COUNT.name, tweetDoc.getFollowersCount()));
    doc.add(new IntPoint(StatusField.FOLLOWERS_COUNT.name, tweetDoc.getFriendsCount()));
    doc.add(new IntPoint(StatusField.STATUSES_COUNT.name, tweetDoc.getStatusesCount()));

    tweetDoc.getInReplyToStatusId().ifPresent(rid -> {
        doc.add(new LongPoint(StatusField.IN_REPLY_TO_STATUS_ID.name, rid));
        tweetDoc.getInReplyToUserId()
                .ifPresent(ruid -> doc.add(new LongPoint(StatusField.IN_REPLY_TO_USER_ID.name, ruid)));
    });

    tweetDoc.getRetweetedStatusId().ifPresent(rid -> {
        doc.add(new LongPoint(StatusField.RETWEETED_STATUS_ID.name, rid));
        tweetDoc.getRetweetedUserId()
                .ifPresent(ruid -> doc.add(new LongPoint(StatusField.RETWEETED_USER_ID.name, ruid)));
        tweetDoc.getRetweetCount().ifPresent(rc -> doc.add(new LongPoint(StatusField.RETWEET_COUNT.name, rc)));
    });

    tweetDoc.getLang().ifPresent(lang -> doc.add(new StringField(StatusField.LANG.name, lang, Field.Store.NO)));

    if (args.storeRawDocs) {
        // store the raw json string as one single field
        doc.add(new StoredField(FIELD_RAW, tweetDoc.getJsonString()));
    }

    FieldType fieldType = new FieldType();
    fieldType.setStored(args.storeTransformedDocs);

    // Are we storing document vectors?
    if (args.storeDocvectors) {
        fieldType.setStoreTermVectors(true);
        fieldType.setStoreTermVectorPositions(true);
    }

    // Are we building a "positional" or "count" index?
    if (args.storePositions) {
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    } else {
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    }
    doc.add(new Field(FIELD_BODY, text, fieldType));

    return doc;
}
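A sketch of how the point fields and the stored raw JSON interact at query time; the searcher and the epoch bounds are illustrative:

Query recent = LongPoint.newRangeQuery(StatusField.EPOCH.name, startEpoch, endEpoch);
TopDocs hits = searcher.search(recent, 10);
for (ScoreDoc sd : hits.scoreDocs) {
    String json = searcher.doc(sd.doc).get(FIELD_RAW); // present only if storeRawDocs was set
}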
From source file: io.datalayer.lucene.helper.AosUtil.java
License: Apache License
public static void indexNumbersMethod() {
    // Constructs stored-only fields from an int, a double, and a String;
    // to take effect they would still have to be added to a Document.
    new StoredField("size", 4096);
    new StoredField("price", 10.99);
    new StoredField("author", "Arthur C. Clark");
}
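For reference, StoredField has an overload per value type; a sketch of the full family (field names and values here are illustrative):

new StoredField("size", 4096);                        // int
new StoredField("timestamp", 1318412345000L);         // long
new StoredField("weight", 2.5f);                      // float
new StoredField("price", 10.99);                      // double
new StoredField("author", "Arthur C. Clark");         // String
new StoredField("thumbnail", new byte[] { 1, 2, 3 }); // byte[]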
From source file: io.datalayer.lucene.helper.AosUtil.java
License: Apache License
/**
 * #1 Good domain boost factor: 1.5
 *
 * #2 Bad domain boost factor: 0.1
 */
public void docBoostMethod() throws IOException {
    Directory dir = new RAMDirectory();
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_46,
            AosAnalyser.NO_LIMIT_TOKEN_COUNT_SIMPLE_ANALYSER);
    IndexWriter writer = new IndexWriter(dir, conf);

    Document doc = new Document();
    String senderEmail = getSenderEmail();
    String senderName = getSenderName();
    String subject = getSubject();
    String body = getBody();
    doc.add(new StoredField("senderEmail", senderEmail));
    doc.add(new Field("senderName", senderName, AosFieldType.INDEXED_STORED_TERMVECTORS));
    doc.add(new Field("subject", subject, AosFieldType.INDEXED_STORED_TERMVECTORS));
    doc.add(new Field("body", body, AosFieldType.INDEXED_STORED_TERMVECTORS));

    String lowerDomain = getSenderDomain().toLowerCase();
    if (isImportant(lowerDomain)) {
        // doc.setBoost(1.5F);
    } else if (isUnimportant(lowerDomain)) {
        // doc.setBoost(0.1F);
    }
    writer.addDocument(doc);
    writer.close();
}
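The setBoost calls are commented out because index-time document boosts are no longer supported in modern Lucene (document-level setBoost was dropped in 4.0, and index-time boosting as a whole was removed in 7.0). A hedged sketch of the usual replacement, folding a stored per-document factor into scoring at query time; the field name and base query are illustrative:

// At index time: record the boost factor as a doc value.
doc.add(new DoubleDocValuesField("domainBoost", isImportant(lowerDomain) ? 1.5 : 0.1));

// At query time: multiply each hit's score by the stored factor.
Query base = new TermQuery(new Term("subject", "lucene"));
Query boosted = FunctionScoreQuery.boostByValue(base, DoubleValuesSource.fromDoubleField("domainBoost"));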
From source file: io.datalayer.lucene.helper.AosUtil.java
License: Apache License
public void numberField() {
    Document doc = new Document();
    doc.add(new StoredField("price", 19.99));
}
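A stored numeric field comes back as a generic Number; a sketch of unwrapping it after a search, with the searcher and document id assumed:

Document hit = searcher.doc(docId);
double price = hit.getField("price").numericValue().doubleValue(); // 19.99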