List of usage examples for org.apache.lucene.document.StoredField
public StoredField(String name, double value) (the examples below also exercise the String, int, and byte[] value overloads)
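Before the per-project examples, here is a minimal sketch (field names are illustrative, not taken from any source below) of the common StoredField overloads. A StoredField value is stored verbatim and returned with search hits, but it is not indexed and cannot be searched on its own:

import org.apache.lucene.document.Document;
import org.apache.lucene.document.StoredField;

public class StoredFieldOverloads {
    public static Document example() {
        Document doc = new Document();
        doc.add(new StoredField("title", "a stored string"));          // StoredField(String, String)
        doc.add(new StoredField("pageCount", 42));                     // StoredField(String, int)
        doc.add(new StoredField("price", 9.99));                       // StoredField(String, double)
        doc.add(new StoredField("thumbnail", new byte[] { 1, 2, 3 })); // StoredField(String, byte[])
        return doc;
    }
}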
From source file:edu.cmu.lti.oaqa.baseqa.document.rerank.LogRegDocumentReranker.java
License:Apache License
private static org.apache.lucene.document.Document toLuceneDocument(Document doc) {
    org.apache.lucene.document.Document entry = new org.apache.lucene.document.Document();
    entry.add(new StoredField("id", doc.getDocId()));
    entry.add(new TextField("title", doc.getTitle(), Field.Store.NO));
    entry.add(new TextField("text", doc.getText(), Field.Store.NO));
    return entry;
}
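To show why the "id" field above is a StoredField, here is a hedged sketch of the retrieval side; the Directory and the query term are assumptions, not part of the original source:

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;

// Search the indexed (but unstored) "text" field, then read the stored "id" back from the hit.
static String firstMatchingId(Directory dir) throws Exception {
    try (DirectoryReader reader = DirectoryReader.open(dir)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        TopDocs hits = searcher.search(new TermQuery(new Term("text", "lucene")), 1);
        if (hits.scoreDocs.length == 0) {
            return null;
        }
        // "id" was added as a StoredField, so it comes back with the hit
        return searcher.doc(hits.scoreDocs[0].doc).get("id");
    }
}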
From source file:edu.cmu.lti.oaqa.baseqa.passage.RetrievalUtil.java
License:Apache License
public static org.apache.lucene.document.Document createLuceneDocument(Passage passage) {
    org.apache.lucene.document.Document entry = new org.apache.lucene.document.Document();
    entry.add(new StoredField("hash", TypeUtil.hash(passage)));
    entry.add(new TextField("text", passage.getText(), Field.Store.NO));
    return entry;
}
From source file:edu.cmu.lti.oaqa.baseqa.passage.RetrievalUtil.java
License:Apache License
public static org.apache.lucene.document.Document createLuceneSectionDocument(Passage passage) {
    org.apache.lucene.document.Document entry = new org.apache.lucene.document.Document();
    entry.add(new StoredField("hash", TypeUtil.hash(passage)));
    entry.add(new TextField(passage.getBeginSection(), passage.getText(), Field.Store.NO));
    return entry;
}
From source file:edu.umass.cs.ciir.IndexFromGalago.java
License:Open Source License
public static void main(String[] args) throws Exception {
    Parameters argp = Parameters.parseArgs(args);
    String galagoIndexPath = null;
    String luceneIndexPath = null;
    try {
        galagoIndexPath = argp.getString("galagoIndex");
        luceneIndexPath = argp.getString("luceneIndex");
    } catch (Exception e) {
        System.out.println(getUsage());
        return;
    }

    logger.setUseParentHandlers(false);
    FileHandler lfh = new FileHandler("indexing-errors.log");
    SimpleFormatter formatter = new SimpleFormatter();
    lfh.setFormatter(formatter);
    logger.addHandler(lfh);

    final DiskIndex index = new DiskIndex(argp.get("index", galagoIndexPath));
    final CorpusReader corpus = (CorpusReader) index.getIndexPart("corpus");
    long total = corpus.getManifest().getLong("keyCount");
    final CorpusReader.KeyIterator iterator = corpus.getIterator();
    final Document.DocumentComponents dcp = Document.DocumentComponents.JustText;

    // Analyzer includes options for text processing
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            // Step 1: tokenization (Lucene's StandardTokenizer is suitable for most text retrieval occasions)
            TokenStreamComponents ts = new TokenStreamComponents(new StandardTokenizer());
            // Step 2: transforming all tokens into lowercased ones
            ts = new Analyzer.TokenStreamComponents(ts.getTokenizer(), new LowerCaseFilter(ts.getTokenStream()));
            // Step 3: whether to remove stop words
            // Uncomment the following line to remove stop words
            // ts = new TokenStreamComponents(ts.getTokenizer(), new StopwordsFilter(ts.getTokenStream(), StandardAnalyzer.ENGLISH_STOP_WORDS_SET));
            // Step 4: whether to apply stemming
            // Uncomment one of the following lines to apply the Krovetz or Porter stemmer
            // ts = new TokenStreamComponents(ts.getTokenizer(), new KStemFilter(ts.getTokenStream()));
            // ts = new TokenStreamComponents(ts.getTokenizer(), new PorterStemFilter(ts.getTokenStream()));
            return ts;
        }
    };

    try (final FSDirectory dir = FSDirectory.open(Paths.get(argp.get("output", luceneIndexPath)))) {
        final IndexWriterConfig cfg = new IndexWriterConfig(analyzer);
        System.out.println("Similarity: " + cfg.getSimilarity());
        cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        try (IndexWriter writer = new IndexWriter(dir, cfg)) {
            iterator.forAllKeyStrings(docId -> {
                try {
                    Document document = iterator.getDocument(dcp);
                    String text = document.text;
                    String id = document.name;
                    System.out.println("Processing document: " + id);

                    org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();
                    doc.add(new StringField("id", id, Field.Store.YES));
                    // this stores the actual text with tags so formatting is preserved
                    doc.add(new StoredField("body", text));

                    org.jsoup.nodes.Document jsoup = Jsoup.parse(text);

                    // tokens of the document
                    FieldType fieldTypeText = new FieldType();
                    fieldTypeText.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
                    fieldTypeText.setStoreTermVectors(true);
                    fieldTypeText.setStoreTermVectorPositions(true);
                    fieldTypeText.setTokenized(true);
                    fieldTypeText.setStored(false);
                    fieldTypeText.freeze();
                    doc.add(new Field("tokens", jsoup.text(), fieldTypeText));

                    try {
                        writer.addDocument(doc);
                        System.out.println("Doc count: " + writer.numDocs());
                    } catch (IOException e) {
                        logger.log(Level.WARNING, "Pull-Document-Exception", e);
                        System.err.println(e.toString());
                    }
                } catch (Exception e) {
                    logger.log(Level.WARNING, "Pull-Document-Exception", e);
                    System.err.println(e.toString());
                }
            });
        }
    }
    System.out.println("Indexing Done.");
}
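The "tokens" field above is indexed with term vectors but not stored, while the raw HTML goes into the stored-only "body" field. A sketch (reader setup assumed, not part of the original source) of reading those term vectors back for one document:

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

// Print every term and its in-document frequency from the "tokens" term vector.
static void printTokens(IndexReader reader, int docID) throws Exception {
    Terms vector = reader.getTermVector(docID, "tokens");
    if (vector == null) {
        return; // no term vector stored for this document
    }
    TermsEnum terms = vector.iterator();
    for (BytesRef term = terms.next(); term != null; term = terms.next()) {
        System.out.println(term.utf8ToString() + " freq=" + terms.totalTermFreq());
    }
}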
From source file:Example.lucene.TestIndexer.java
private static void addDoc(IndexWriter w, int id, String url, String title, String content) throws IOException {
    Document doc = new Document();
    // "id" is stored only: it is returned with search hits but is not searchable
    doc.add(new StoredField("id", id));
    doc.add(new TextField("url", url, Field.Store.YES));
    doc.add(new TextField("title", title, Field.Store.NO));
    doc.add(new TextField("content", content, Field.Store.NO));
    w.addDocument(doc);
}
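A sketch of how addDoc might be driven; the writer setup, index path, and sample row are assumptions, not part of the original source:

import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;

// Hypothetical driver: index one row, then close the writer.
try (IndexWriter w = new IndexWriter(FSDirectory.open(Paths.get("test-index")),
        new IndexWriterConfig(new StandardAnalyzer()))) {
    addDoc(w, 1, "http://example.com/1", "Example title", "Example body text");
}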
From source file:fr.paris.lutece.plugins.directory.service.search.DirectorySearchIndexer.java
License:Open Source License
/**
 * Builds a document which will be used by Lucene during the indexing of this record.
 *
 * @param record the record to convert into a document
 * @param listContentEntry the entries in this record that are marked as is_indexed
 * @param listTitleEntry the entries in this record that are marked as is_indexed_as_title
 * @param listSummaryEntry the entries in this record that are marked as is_indexed_as_summary
 * @param plugin the plugin object
 * @return a Lucene document filled with the record data
 */
public Document getDocument(Record record, List<IEntry> listContentEntry, List<IEntry> listTitleEntry,
        List<IEntry> listSummaryEntry, Plugin plugin) {
    Document doc = new Document();

    FieldType ft = new FieldType(StringField.TYPE_STORED);
    ft.setOmitNorms(false);

    FieldType ftNotStored = new FieldType(StringField.TYPE_NOT_STORED);
    ftNotStored.setOmitNorms(false);
    ftNotStored.setTokenized(false);

    boolean bFallback = false;

    // Fallback if there is no entry marked as indexed_as_title:
    // use the first indexed field instead
    if (listTitleEntry.isEmpty() && !listContentEntry.isEmpty()) {
        listTitleEntry.add(listContentEntry.get(0));
        bFallback = true;
    }

    String strTitle = getContentToIndex(record, listTitleEntry, plugin);

    // Fallback if the title fields were empty:
    // use the first indexed field instead
    if (StringUtils.isBlank(strTitle) && !bFallback && !listContentEntry.isEmpty()) {
        listTitleEntry.clear();
        listTitleEntry.add(listContentEntry.get(0));
        strTitle = getContentToIndex(record, listTitleEntry, plugin);
    }

    // No more fallback. Giving up
    if (StringUtils.isBlank(strTitle)) {
        return null;
    }

    doc.add(new Field(SearchItem.FIELD_TITLE, strTitle, ft));

    if (!listContentEntry.isEmpty()) {
        String strContent = getContentToIndex(record, listContentEntry, plugin);
        if (StringUtils.isNotBlank(strContent)) {
            doc.add(new Field(SearchItem.FIELD_CONTENTS, strContent, TextField.TYPE_NOT_STORED));
        }
    }

    if (!listSummaryEntry.isEmpty()) {
        String strSummary = getContentToIndex(record, listSummaryEntry, plugin);
        if (StringUtils.isNotBlank(strSummary)) {
            doc.add(new StoredField(SearchItem.FIELD_SUMMARY, strSummary));
        }
    }

    String strRoleKey = record.getRoleKey();
    if (StringUtils.isBlank(strRoleKey)) {
        strRoleKey = ROLE_NONE;
    }
    doc.add(new Field(SearchItem.FIELD_ROLE, strRoleKey, ft));

    String strDate = DateTools.dateToString(record.getDateCreation(), DateTools.Resolution.DAY);
    doc.add(new Field(SearchItem.FIELD_DATE, strDate, ft));

    String strDateModification = DateTools.dateToString(record.getDateModification(), DateTools.Resolution.DAY);
    doc.add(new Field(SearchItem.FIELD_DATE, strDateModification, ft));

    doc.add(new Field(SearchItem.FIELD_TYPE, DIRECTORY, ft));

    UrlItem url = new UrlItem(AppPathService.getPortalUrl());
    url.addParameter(XPageAppService.PARAM_XPAGE_APP, DIRECTORY);
    url.addParameter(PARAMETER_ID_DIRECTORY_RECORD, record.getIdRecord());
    url.addParameter(PARAMETER_VIEW_DIRECTORY_RECORD, "");
    doc.add(new Field(SearchItem.FIELD_URL, url.getUrl(), ft));

    // Add the uid as a field, so that the index can be incrementally maintained.
    // This field is not stored with the record; it is indexed, but it is not
    // tokenized prior to indexing.
    String strUID = Integer.toString(record.getIdRecord()) + "_" + SHORT_NAME;
    doc.add(new Field(SearchItem.FIELD_UID, strUID, ftNotStored));

    return doc;
}
From source file:fr.paris.lutece.plugins.document.service.search.DocumentIndexer.java
License:Open Source License
/**
 * Builds a document which will be used by Lucene during the indexing of the
 * pages of the site, with the following fields: summary, uid, url, contents,
 * title and description.
 *
 * @param document the document to index
 * @param strUrl the url of the document
 * @param strRole the Lutece role of the page associated with the document
 * @param strPortletDocumentId the document id concatenated with the portlet id,
 *        separated by an ampersand
 * @return the built Document
 * @throws IOException The IO Exception
 * @throws InterruptedException The InterruptedException
 */
public static org.apache.lucene.document.Document getDocument(Document document, String strUrl, String strRole,
        String strPortletDocumentId) throws IOException, InterruptedException {
    // make a new, empty document
    org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();

    FieldType ft = new FieldType(StringField.TYPE_STORED);
    ft.setOmitNorms(false);

    // Add the url as a field named "url". Use an un-indexed field, so
    // that the url is just stored with the document, but is not searchable.
    doc.add(new Field(SearchItem.FIELD_URL, strUrl, ft));

    // Add the PortletDocumentId as a field named "document_portlet_id".
    doc.add(new Field(SearchItem.FIELD_DOCUMENT_PORTLET_ID, strPortletDocumentId, ft));

    // Add the last modified date of the file as a field named "modified".
    // Use a field that is indexed (i.e. searchable), but don't tokenize
    // the field into words.
    String strDate = DateTools.dateToString(document.getDateModification(), DateTools.Resolution.DAY);
    doc.add(new Field(SearchItem.FIELD_DATE, strDate, ft));

    // Add the uid as a field, so that the index can be incrementally maintained.
    // This field is not stored with the document; it is indexed, but it is not
    // tokenized prior to indexing.
    String strIdDocument = String.valueOf(document.getId());
    doc.add(new Field(SearchItem.FIELD_UID, strIdDocument + "_" + DocumentIndexer.SHORT_NAME, ft));

    String strContentToIndex = getContentToIndex(document);
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();

    try {
        new HtmlParser().parse(new ByteArrayInputStream(strContentToIndex.getBytes()), handler, metadata,
                new ParseContext());
    } catch (SAXException e) {
        throw new AppException("Error during document parsing.");
    } catch (TikaException e) {
        throw new AppException("Error during document parsing.");
    }

    // The article content is recovered through the parser, because the parser
    // replaces encoded entities (such as &eacute;) with the corresponding character (such as é)
    String strContent = handler.toString();

    // Add the tag-stripped contents as a Reader-valued Text field so it will
    // get tokenized and indexed.
    doc.add(new Field(SearchItem.FIELD_CONTENTS, strContent, TextField.TYPE_NOT_STORED));

    // Add the title as a separate Text field, so that it can be searched
    // separately.
    FieldType ft2 = new FieldType(TextField.TYPE_STORED);
    ft2.setOmitNorms(true);
    doc.add(new Field(SearchItem.FIELD_TITLE, document.getTitle(), ft2));

    doc.add(new Field(SearchItem.FIELD_TYPE, document.getType(), ft));
    doc.add(new Field(SearchItem.FIELD_ROLE, strRole, ft));

    // add metadata (mapped to summary)
    doc.add(new Field(SearchItem.FIELD_METADATA, document.getSummary(), TextField.TYPE_NOT_STORED));
    doc.add(new StoredField(SearchItem.FIELD_SUMMARY, document.getSummary()));

    // return the document
    return doc;
}
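Note the design choice at the end of getDocument: the summary is added twice, once as an unstored TextField (FIELD_METADATA) so it is searchable, and once as a StoredField (FIELD_SUMMARY) so it can be displayed with search results without re-loading the original document.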
From source file:imgProc.SiftDocumentBuilder.java
License:Open Source License
public Field[] createDescriptorFields(BufferedImage image) {
    Field[] result = null;
    try {
        // extract features from image:
        List<Feature> features = extractor.computeSiftFeatures(image);
        result = new Field[features.size()];
        int count = 0;
        // create new document:
        for (Iterator<Feature> fit = features.iterator(); fit.hasNext();) {
            Feature f = fit.next();
            result[count] = new StoredField(DocumentBuilder.FIELD_NAME_SIFT, f.getStringRepresentation());
            System.out.println("Scale:" + f.scale);
            count++;
        }
    } catch (IOException e) {
        //logger.severe(e.getMessage());
    }
    return result;
}
From source file:imgProc.SiftDocumentBuilder.java
License:Open Source License
public Document createDocument(BufferedImage image, String identifier) {
    Document doc = null;
    try {
        // extract features from image:
        List<Feature> features = extractor.computeSiftFeatures(image);
        // create new document:
        doc = new Document();
        for (Iterator<Feature> fit = features.iterator(); fit.hasNext();) {
            Feature f = fit.next();
            // add each feature to the document:
            doc.add(new StoredField(DocumentBuilder.FIELD_NAME_SIFT, f.getByteArrayRepresentation()));
        }
        if (identifier != null) {
            doc.add(new StringField(DocumentBuilder.FIELD_NAME_IDENTIFIER, identifier, Field.Store.YES));
        }
    } catch (IOException e) {
        // logger.severe(e.getMessage());
    }
    return doc;
}
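Since each SIFT feature is added as a separate binary StoredField under the same name, all of them can be read back from a retrieved document. A sketch, where "hit" is assumed to be a Document fetched from the index (e.g. via IndexSearcher#doc):

import org.apache.lucene.util.BytesRef;

// Binary StoredField values come back as BytesRef instances.
BytesRef[] values = hit.getBinaryValues(DocumentBuilder.FIELD_NAME_SIFT);
for (BytesRef v : values) {
    byte[] raw = new byte[v.length];
    System.arraycopy(v.bytes, v.offset, raw, 0, v.length);
    // raw can be fed back into a Feature via its byte-array representation
}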
From source file:index.IndexDirectoryBuilder.java
License:Apache License
/**
 * Builds a Lucene document to be added to the index based on a
 * specified name for the location and the corresponding
 * {@link GeoName} object.
 *
 * @param name name to serve as index key
 * @param geonameEntry string from GeoNames gazetteer
 * @param geonameID unique identifier (for quick look-up)
 * @param population number of inhabitants (used for scoring)
 * @return document to be added to the index
 */
public static Document buildDoc(String name, String geonameEntry, int geonameID, Long population) {
    // in case you're wondering, yes, this is a non-standard use of
    // the Lucene Document construct :)
    Document doc = new Document();

    // this is essentially the key we'll try to match location
    // names against
    doc.add(new TextField("indexName", name, Field.Store.YES));

    // this is the payload we'll return when matching location
    // names to gazetteer records
    doc.add(new StoredField("geoname", geonameEntry));

    // TODO: use geonameID to link administrative subdivisions to
    // each other
    doc.add(new IntField("geonameID", geonameID, Field.Store.YES));

    // we'll initially sort match results based on population
    doc.add(new LongField("population", population, Field.Store.YES));

    logger.debug("Adding to index: " + name);
    return doc;
}
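IntField and LongField here are the pre-Lucene-6 numeric field classes; on Lucene 6 and later this method would not compile as written. A hedged sketch (an assumption, not part of the original source) of the modern equivalent, where a point field handles matching, a doc-values field handles sorting, and an explicit StoredField preserves retrieval:

import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StoredField;

// Lucene 6+ replacement for the IntField/LongField lines above
doc.add(new IntPoint("geonameID", geonameID));
doc.add(new StoredField("geonameID", geonameID));
doc.add(new NumericDocValuesField("population", population));
doc.add(new StoredField("population", population));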