List of usage examples for org.apache.lucene.document.StoredField
public StoredField(String name, double value) (the examples below also exercise the String, int, and byte[] value overloads)
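Before the per-project examples, here is a minimal sketch (field names are illustrative, not taken from any source below) of the common StoredField overloads. A StoredField value is stored verbatim and returned with search hits, but it is not indexed and cannot be searched on its own:

import org.apache.lucene.document.Document;
import org.apache.lucene.document.StoredField;

public class StoredFieldOverloads {
    public static Document example() {
        Document doc = new Document();
        doc.add(new StoredField("title", "a stored string"));          // StoredField(String, String)
        doc.add(new StoredField("pageCount", 42));                     // StoredField(String, int)
        doc.add(new StoredField("price", 9.99));                       // StoredField(String, double)
        doc.add(new StoredField("thumbnail", new byte[] { 1, 2, 3 })); // StoredField(String, byte[])
        return doc;
    }
}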
From source file:edu.cmu.lti.oaqa.baseqa.document.rerank.LogRegDocumentReranker.java
License:Apache License
private static org.apache.lucene.document.Document toLuceneDocument(Document doc) {
    org.apache.lucene.document.Document entry = new org.apache.lucene.document.Document();
    entry.add(new StoredField("id", doc.getDocId()));
    entry.add(new TextField("title", doc.getTitle(), Field.Store.NO));
    entry.add(new TextField("text", doc.getText(), Field.Store.NO));
    return entry;
}
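To show why the "id" field above is a StoredField, here is a hedged sketch of the retrieval side; the Directory and the query term are assumptions, not part of the original source:

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;

// Search the indexed (but unstored) "text" field, then read the stored "id" back from the hit.
static String firstMatchingId(Directory dir) throws Exception {
    try (DirectoryReader reader = DirectoryReader.open(dir)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        TopDocs hits = searcher.search(new TermQuery(new Term("text", "lucene")), 1);
        if (hits.scoreDocs.length == 0) {
            return null;
        }
        // "id" was added as a StoredField, so it comes back with the hit
        return searcher.doc(hits.scoreDocs[0].doc).get("id");
    }
}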
From source file:edu.cmu.lti.oaqa.baseqa.passage.RetrievalUtil.java
License:Apache License
public static org.apache.lucene.document.Document createLuceneDocument(Passage passage) {
    org.apache.lucene.document.Document entry = new org.apache.lucene.document.Document();
    entry.add(new StoredField("hash", TypeUtil.hash(passage)));
    entry.add(new TextField("text", passage.getText(), Field.Store.NO));
    return entry;
}
From source file:edu.cmu.lti.oaqa.baseqa.passage.RetrievalUtil.java
License:Apache License
public static org.apache.lucene.document.Document createLuceneSectionDocument(Passage passage) {
    org.apache.lucene.document.Document entry = new org.apache.lucene.document.Document();
    entry.add(new StoredField("hash", TypeUtil.hash(passage)));
    entry.add(new TextField(passage.getBeginSection(), passage.getText(), Field.Store.NO));
    return entry;
}
From source file:edu.umass.cs.ciir.IndexFromGalago.java
License:Open Source License
public static void main(String[] args) throws Exception {
    Parameters argp = Parameters.parseArgs(args);
    String galagoIndexPath = null;
    String luceneIndexPath = null;
    try {
        galagoIndexPath = argp.getString("galagoIndex");
        luceneIndexPath = argp.getString("luceneIndex");
    } catch (Exception e) {
        System.out.println(getUsage());
        return;
    }

    logger.setUseParentHandlers(false);
    FileHandler lfh = new FileHandler("indexing-errors.log");
    SimpleFormatter formatter = new SimpleFormatter();
    lfh.setFormatter(formatter);
    logger.addHandler(lfh);

    final DiskIndex index = new DiskIndex(argp.get("index", galagoIndexPath));
    final CorpusReader corpus = (CorpusReader) index.getIndexPart("corpus");
    long total = corpus.getManifest().getLong("keyCount");
    final CorpusReader.KeyIterator iterator = corpus.getIterator();
    final Document.DocumentComponents dcp = Document.DocumentComponents.JustText;

    // Analyzer includes options for text processing
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            // Step 1: tokenization (Lucene's StandardTokenizer is suitable for most text retrieval occasions)
            TokenStreamComponents ts = new TokenStreamComponents(new StandardTokenizer());
            // Step 2: transforming all tokens into lowercased ones
            ts = new Analyzer.TokenStreamComponents(ts.getTokenizer(), new LowerCaseFilter(ts.getTokenStream()));
            // Step 3: whether to remove stop words
            // Uncomment the following line to remove stop words
            // ts = new TokenStreamComponents(ts.getTokenizer(), new StopwordsFilter(ts.getTokenStream(), StandardAnalyzer.ENGLISH_STOP_WORDS_SET));
            // Step 4: whether to apply stemming
            // Uncomment one of the following lines to apply the Krovetz or Porter stemmer
            // ts = new TokenStreamComponents(ts.getTokenizer(), new KStemFilter(ts.getTokenStream()));
            // ts = new TokenStreamComponents(ts.getTokenizer(), new PorterStemFilter(ts.getTokenStream()));
            return ts;
        }
    };

    try (final FSDirectory dir = FSDirectory.open(Paths.get(argp.get("output", luceneIndexPath)))) {
        final IndexWriterConfig cfg = new IndexWriterConfig(analyzer);
        System.out.println("Similarity: " + cfg.getSimilarity());
        cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        try (IndexWriter writer = new IndexWriter(dir, cfg)) {
            iterator.forAllKeyStrings(docId -> {
                try {
                    Document document = iterator.getDocument(dcp);
                    String text = document.text;
                    String id = document.name;
                    System.out.println("Processing document: " + id);

                    org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();
                    doc.add(new StringField("id", id, Field.Store.YES));
                    // this stores the actual text with tags so formatting is preserved
                    doc.add(new StoredField("body", text));

                    org.jsoup.nodes.Document jsoup = Jsoup.parse(text);

                    // tokens of the document
                    FieldType fieldTypeText = new FieldType();
                    fieldTypeText.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
                    fieldTypeText.setStoreTermVectors(true);
                    fieldTypeText.setStoreTermVectorPositions(true);
                    fieldTypeText.setTokenized(true);
                    fieldTypeText.setStored(false);
                    fieldTypeText.freeze();
                    doc.add(new Field("tokens", jsoup.text(), fieldTypeText));

                    try {
                        writer.addDocument(doc);
                        System.out.println("Doc count: " + writer.numDocs());
                    } catch (IOException e) {
                        logger.log(Level.WARNING, "Pull-Document-Exception", e);
                        System.err.println(e.toString());
                    }
                } catch (Exception e) {
                    logger.log(Level.WARNING, "Pull-Document-Exception", e);
                    System.err.println(e.toString());
                }
            });
        }
    }
    System.out.println("Indexing Done.");
}
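The "tokens" field above is indexed with term vectors but not stored, while the raw HTML goes into the stored-only "body" field. A sketch (reader setup assumed, not part of the original source) of reading those term vectors back for one document:

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

// Print every term and its in-document frequency from the "tokens" term vector.
static void printTokens(IndexReader reader, int docID) throws Exception {
    Terms vector = reader.getTermVector(docID, "tokens");
    if (vector == null) {
        return; // no term vector stored for this document
    }
    TermsEnum terms = vector.iterator();
    for (BytesRef term = terms.next(); term != null; term = terms.next()) {
        System.out.println(term.utf8ToString() + " freq=" + terms.totalTermFreq());
    }
}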
From source file:Example.lucene.TestIndexer.java
private static void addDoc(IndexWriter w, int id, String url, String title, String content) throws IOException {
    Document doc = new Document();
    // "id" is stored only: it is returned with search hits but is not searchable
    doc.add(new StoredField("id", id));
    doc.add(new TextField("url", url, Field.Store.YES));
    doc.add(new TextField("title", title, Field.Store.NO));
    doc.add(new TextField("content", content, Field.Store.NO));
    w.addDocument(doc);
}
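A sketch of how addDoc might be driven; the writer setup, index path, and sample row are assumptions, not part of the original source:

import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;

// Hypothetical driver: index one row, then close the writer.
try (IndexWriter w = new IndexWriter(FSDirectory.open(Paths.get("test-index")),
        new IndexWriterConfig(new StandardAnalyzer()))) {
    addDoc(w, 1, "http://example.com/1", "Example title", "Example body text");
}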
From source file:fr.paris.lutece.plugins.directory.service.search.DirectorySearchIndexer.java
License:Open Source License
/**
 * Builds a document which will be used by Lucene during the indexing of this record.
 *
 * @param record the record to convert into a document
 * @param listContentEntry the entries in this record that are marked as is_indexed
 * @param listTitleEntry the entries in this record that are marked as is_indexed_as_title
 * @param listSummaryEntry the entries in this record that are marked as is_indexed_as_summary
 * @param plugin the plugin object
 * @return a Lucene document filled with the record data
 */
public Document getDocument(Record record, List<IEntry> listContentEntry, List<IEntry> listTitleEntry,
        List<IEntry> listSummaryEntry, Plugin plugin) {
    Document doc = new Document();

    FieldType ft = new FieldType(StringField.TYPE_STORED);
    ft.setOmitNorms(false);

    FieldType ftNotStored = new FieldType(StringField.TYPE_NOT_STORED);
    ftNotStored.setOmitNorms(false);
    ftNotStored.setTokenized(false);

    boolean bFallback = false;

    // Fallback if there is no entry marked as indexed_as_title:
    // use the first indexed field instead
    if (listTitleEntry.isEmpty() && !listContentEntry.isEmpty()) {
        listTitleEntry.add(listContentEntry.get(0));
        bFallback = true;
    }

    String strTitle = getContentToIndex(record, listTitleEntry, plugin);

    // Fallback if the title fields were empty:
    // use the first indexed field instead
    if (StringUtils.isBlank(strTitle) && !bFallback && !listContentEntry.isEmpty()) {
        listTitleEntry.clear();
        listTitleEntry.add(listContentEntry.get(0));
        strTitle = getContentToIndex(record, listTitleEntry, plugin);
    }

    // No more fallback. Giving up
    if (StringUtils.isBlank(strTitle)) {
        return null;
    }

    doc.add(new Field(SearchItem.FIELD_TITLE, strTitle, ft));

    if (!listContentEntry.isEmpty()) {
        String strContent = getContentToIndex(record, listContentEntry, plugin);
        if (StringUtils.isNotBlank(strContent)) {
            doc.add(new Field(SearchItem.FIELD_CONTENTS, strContent, TextField.TYPE_NOT_STORED));
        }
    }

    if (!listSummaryEntry.isEmpty()) {
        String strSummary = getContentToIndex(record, listSummaryEntry, plugin);
        if (StringUtils.isNotBlank(strSummary)) {
            doc.add(new StoredField(SearchItem.FIELD_SUMMARY, strSummary));
        }
    }

    String strRoleKey = record.getRoleKey();
    if (StringUtils.isBlank(strRoleKey)) {
        strRoleKey = ROLE_NONE;
    }
    doc.add(new Field(SearchItem.FIELD_ROLE, strRoleKey, ft));

    String strDate = DateTools.dateToString(record.getDateCreation(), DateTools.Resolution.DAY);
    doc.add(new Field(SearchItem.FIELD_DATE, strDate, ft));

    String strDateModification = DateTools.dateToString(record.getDateModification(), DateTools.Resolution.DAY);
    doc.add(new Field(SearchItem.FIELD_DATE, strDateModification, ft));

    doc.add(new Field(SearchItem.FIELD_TYPE, DIRECTORY, ft));

    UrlItem url = new UrlItem(AppPathService.getPortalUrl());
    url.addParameter(XPageAppService.PARAM_XPAGE_APP, DIRECTORY);
    url.addParameter(PARAMETER_ID_DIRECTORY_RECORD, record.getIdRecord());
    url.addParameter(PARAMETER_VIEW_DIRECTORY_RECORD, "");
    doc.add(new Field(SearchItem.FIELD_URL, url.getUrl(), ft));

    // Add the uid as a field, so that the index can be incrementally maintained.
    // This field is not stored with the record; it is indexed, but it is not
    // tokenized prior to indexing.
    String strUID = Integer.toString(record.getIdRecord()) + "_" + SHORT_NAME;
    doc.add(new Field(SearchItem.FIELD_UID, strUID, ftNotStored));

    return doc;
}
From source file:fr.paris.lutece.plugins.document.service.search.DocumentIndexer.java
License:Open Source License
/**
 * Builds a document which will be used by Lucene during the indexing of the
 * pages of the site, with the following fields: summary, uid, url, contents,
 * title and description.
 *
 * @param document the document to index
 * @param strUrl the url of the document
 * @param strRole the Lutece role of the page associated with the document
 * @param strPortletDocumentId the document id concatenated with the portlet id,
 *        separated by an ampersand
 * @return the built Document
 * @throws IOException The IO Exception
 * @throws InterruptedException The InterruptedException
 */
public static org.apache.lucene.document.Document getDocument(Document document, String strUrl, String strRole,
        String strPortletDocumentId) throws IOException, InterruptedException {
    // make a new, empty document
    org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();

    FieldType ft = new FieldType(StringField.TYPE_STORED);
    ft.setOmitNorms(false);

    // Add the url as a field named "url". Use an un-indexed field, so
    // that the url is just stored with the document, but is not searchable.
    doc.add(new Field(SearchItem.FIELD_URL, strUrl, ft));

    // Add the PortletDocumentId as a field named "document_portlet_id".
    doc.add(new Field(SearchItem.FIELD_DOCUMENT_PORTLET_ID, strPortletDocumentId, ft));

    // Add the last modified date of the file as a field named "modified".
    // Use a field that is indexed (i.e. searchable), but don't tokenize
    // the field into words.
    String strDate = DateTools.dateToString(document.getDateModification(), DateTools.Resolution.DAY);
    doc.add(new Field(SearchItem.FIELD_DATE, strDate, ft));

    // Add the uid as a field, so that the index can be incrementally maintained.
    // This field is not stored with the document; it is indexed, but it is not
    // tokenized prior to indexing.
    String strIdDocument = String.valueOf(document.getId());
    doc.add(new Field(SearchItem.FIELD_UID, strIdDocument + "_" + DocumentIndexer.SHORT_NAME, ft));

    String strContentToIndex = getContentToIndex(document);
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();

    try {
        new HtmlParser().parse(new ByteArrayInputStream(strContentToIndex.getBytes()), handler, metadata,
                new ParseContext());
    } catch (SAXException e) {
        throw new AppException("Error during document parsing.");
    } catch (TikaException e) {
        throw new AppException("Error during document parsing.");
    }

    // The article content is recovered through the parser, because the parser
    // replaces encoded entities (such as &eacute;) with the corresponding character (such as é)
    String strContent = handler.toString();

    // Add the tag-stripped contents as a Reader-valued Text field so it will
    // get tokenized and indexed.
    doc.add(new Field(SearchItem.FIELD_CONTENTS, strContent, TextField.TYPE_NOT_STORED));

    // Add the title as a separate Text field, so that it can be searched
    // separately.
    FieldType ft2 = new FieldType(TextField.TYPE_STORED);
    ft2.setOmitNorms(true);
    doc.add(new Field(SearchItem.FIELD_TITLE, document.getTitle(), ft2));

    doc.add(new Field(SearchItem.FIELD_TYPE, document.getType(), ft));
    doc.add(new Field(SearchItem.FIELD_ROLE, strRole, ft));

    // add metadata (mapped to summary)
    doc.add(new Field(SearchItem.FIELD_METADATA, document.getSummary(), TextField.TYPE_NOT_STORED));
    doc.add(new StoredField(SearchItem.FIELD_SUMMARY, document.getSummary()));

    // return the document
    return doc;
}
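Note the design choice at the end of getDocument: the summary is added twice, once as an unstored TextField (FIELD_METADATA) so it is searchable, and once as a StoredField (FIELD_SUMMARY) so it can be displayed with search results without re-loading the original document.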
From source file:imgProc.SiftDocumentBuilder.java
License:Open Source License
public Field[] createDescriptorFields(BufferedImage image) {
    Field[] result = null;
    try {
        // extract features from image:
        List<Feature> features = extractor.computeSiftFeatures(image);
        result = new Field[features.size()];
        int count = 0;
        // create new document:
        for (Iterator<Feature> fit = features.iterator(); fit.hasNext();) {
            Feature f = fit.next();
            result[count] = new StoredField(DocumentBuilder.FIELD_NAME_SIFT, f.getStringRepresentation());
            System.out.println("Scale:" + f.scale);
            count++;
        }
    } catch (IOException e) {
        //logger.severe(e.getMessage());
    }
    return result;
}
From source file:imgProc.SiftDocumentBuilder.java
License:Open Source License
public Document createDocument(BufferedImage image, String identifier) {
    Document doc = null;
    try {
        // extract features from image:
        List<Feature> features = extractor.computeSiftFeatures(image);
        // create new document:
        doc = new Document();
        for (Iterator<Feature> fit = features.iterator(); fit.hasNext();) {
            Feature f = fit.next();
            // add each feature to the document:
            doc.add(new StoredField(DocumentBuilder.FIELD_NAME_SIFT, f.getByteArrayRepresentation()));
        }
        if (identifier != null) {
            doc.add(new StringField(DocumentBuilder.FIELD_NAME_IDENTIFIER, identifier, Field.Store.YES));
        }
    } catch (IOException e) {
        // logger.severe(e.getMessage());
    }
    return doc;
}
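Since each SIFT feature is added as a separate binary StoredField under the same name, all of them can be read back from a retrieved document. A sketch, where "hit" is assumed to be a Document fetched from the index (e.g. via IndexSearcher#doc):

import org.apache.lucene.util.BytesRef;

// Binary StoredField values come back as BytesRef instances.
BytesRef[] values = hit.getBinaryValues(DocumentBuilder.FIELD_NAME_SIFT);
for (BytesRef v : values) {
    byte[] raw = new byte[v.length];
    System.arraycopy(v.bytes, v.offset, raw, 0, v.length);
    // raw can be fed back into a Feature via its byte-array representation
}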
From source file:index.IndexDirectoryBuilder.java
License:Apache License
/**
 * Builds a Lucene document to be added to the index based on a
 * specified name for the location and the corresponding
 * {@link GeoName} object.
 *
 * @param name name to serve as index key
 * @param geonameEntry string from GeoNames gazetteer
 * @param geonameID unique identifier (for quick look-up)
 * @param population number of inhabitants (used for scoring)
 * @return document to be added to the index
 */
public static Document buildDoc(String name, String geonameEntry, int geonameID, Long population) {
    // in case you're wondering, yes, this is a non-standard use of
    // the Lucene Document construct :)
    Document doc = new Document();

    // this is essentially the key we'll try to match location
    // names against
    doc.add(new TextField("indexName", name, Field.Store.YES));

    // this is the payload we'll return when matching location
    // names to gazetteer records
    doc.add(new StoredField("geoname", geonameEntry));

    // TODO: use geonameID to link administrative subdivisions to
    // each other
    doc.add(new IntField("geonameID", geonameID, Field.Store.YES));

    // we'll initially sort match results based on population
    doc.add(new LongField("population", population, Field.Store.YES));

    logger.debug("Adding to index: " + name);
    return doc;
}
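IntField and LongField here are the pre-Lucene-6 numeric field classes; on Lucene 6 and later this method would not compile as written. A hedged sketch (an assumption, not part of the original source) of the modern equivalent, where a point field handles matching, a doc-values field handles sorting, and an explicit StoredField preserves retrieval:

import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StoredField;

// Lucene 6+ replacement for the IntField/LongField lines above
doc.add(new IntPoint("geonameID", geonameID));
doc.add(new StoredField("geonameID", geonameID));
doc.add(new NumericDocValuesField("population", population));
doc.add(new StoredField("population", population));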