Example usage for the org.apache.lucene.document.StoredField constructor

Introduction

On this page you can find example usage for the org.apache.lucene.document.StoredField constructor. The examples below exercise several of its overloads (String, int, and byte[] values) in addition to the double form shown in the prototype.

Prototype

public StoredField(String name, double value) 

Documentation

Create a stored-only field with the given double value.
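
A minimal sketch of the double overload end to end. It is not taken from the sources below: the "price" and "id" field names and the in-memory ByteBuffersDirectory are illustrative, and the retrieval calls assume a Lucene 8.x-era API.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

public class StoredDoubleExample {
    public static void main(String[] args) throws Exception {
        Directory dir = new ByteBuffersDirectory();
        try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            Document doc = new Document();
            doc.add(new StringField("id", "sku-1", Field.Store.NO));
            // stored only: the value travels with the document but is not searchable
            doc.add(new StoredField("price", 9.99));
            writer.addDocument(doc);
        }
        try (IndexReader reader = DirectoryReader.open(dir)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            TopDocs hits = searcher.search(new TermQuery(new Term("id", "sku-1")), 1);
            Document hit = searcher.doc(hits.scoreDocs[0].doc);
            // numeric stored fields come back as a Number
            double price = hit.getField("price").numericValue().doubleValue();
            System.out.println("price = " + price);
        }
    }
}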

Usage

From source file: edu.cmu.lti.oaqa.baseqa.document.rerank.LogRegDocumentReranker.java

License: Apache License

private static org.apache.lucene.document.Document toLuceneDocument(Document doc) {
    org.apache.lucene.document.Document entry = new org.apache.lucene.document.Document();
    entry.add(new StoredField("id", doc.getDocId()));
    entry.add(new TextField("title", doc.getTitle(), Field.Store.NO));
    entry.add(new TextField("text", doc.getText(), Field.Store.NO));
    return entry;
}
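
Note the pattern: only "id" is stored, while "title" and "text" are indexed but not stored. A hedged sketch of how the stored id might be read back after a search on "text" (the IndexSearcher is assumed to be open over the index built above; this helper is not part of the original source):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;

static List<String> topDocIds(IndexSearcher searcher, String term) throws IOException {
    List<String> ids = new ArrayList<>();
    TopDocs hits = searcher.search(new TermQuery(new Term("text", term)), 10);
    for (ScoreDoc sd : hits.scoreDocs) {
        // only stored fields are available on a retrieved hit, so "id" comes
        // back while the unstored "title" and "text" do not
        ids.add(searcher.doc(sd.doc).get("id"));
    }
    return ids;
}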

From source file: edu.cmu.lti.oaqa.baseqa.passage.RetrievalUtil.java

License: Apache License

public static org.apache.lucene.document.Document createLuceneDocument(Passage passage) {
    org.apache.lucene.document.Document entry = new org.apache.lucene.document.Document();
    entry.add(new StoredField("hash", TypeUtil.hash(passage)));
    entry.add(new TextField("text", passage.getText(), Field.Store.NO));
    return entry;
}

From source file: edu.cmu.lti.oaqa.baseqa.passage.RetrievalUtil.java

License: Apache License

public static org.apache.lucene.document.Document createLuceneSectionDocument(Passage passage) {
    org.apache.lucene.document.Document entry = new org.apache.lucene.document.Document();
    entry.add(new StoredField("hash", TypeUtil.hash(passage)));
    entry.add(new TextField(passage.getBeginSection(), passage.getText(), Field.Store.NO));
    return entry;
}

From source file: edu.umass.cs.ciir.IndexFromGalago.java

License: Open Source License

public static void main(String[] args) throws Exception {
    Parameters argp = Parameters.parseArgs(args);
    String galagoIndexPath = null;
    String luceneIndexPath = null;
    try {
        galagoIndexPath = argp.getString("galagoIndex");
        luceneIndexPath = argp.getString("luceneIndex");
    } catch (Exception e) {
        System.out.println(getUsage());
        return;
    }

    logger.setUseParentHandlers(false);
    FileHandler lfh = new FileHandler("indexing-errors.log");
    SimpleFormatter formatter = new SimpleFormatter();
    lfh.setFormatter(formatter);
    logger.addHandler(lfh);

    final DiskIndex index = new DiskIndex(argp.get("index", galagoIndexPath));
    final CorpusReader corpus = (CorpusReader) index.getIndexPart("corpus");
    long total = corpus.getManifest().getLong("keyCount");
    final CorpusReader.KeyIterator iterator = corpus.getIterator();

    final Document.DocumentComponents dcp = Document.DocumentComponents.JustText;
    // Analyzer includes options for text processing
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            // Step 1: tokenization (Lucene's StandardTokenizer is suitable for most text retrieval tasks)
            TokenStreamComponents ts = new TokenStreamComponents(new StandardTokenizer());
            // Step 2: transforming all tokens into lowercased ones
            ts = new Analyzer.TokenStreamComponents(ts.getTokenizer(),
                    new LowerCaseFilter(ts.getTokenStream()));
            // Step 3: whether to remove stop words
            // Uncomment the following line to remove stop words
            // ts = new TokenStreamComponents( ts.getTokenizer(), new StopwordsFilter( ts.getTokenStream(), StandardAnalyzer.ENGLISH_STOP_WORDS_SET ) );
            // Step 4: whether to apply stemming
            // Uncomment the following line to apply Krovetz or Porter stemmer
            // ts = new TokenStreamComponents( ts.getTokenizer(), new KStemFilter( ts.getTokenStream() ) );
            // ts = new TokenStreamComponents( ts.getTokenizer(), new PorterStemFilter( ts.getTokenStream() ) );
            return ts;
        }
    };

    try (final FSDirectory dir = FSDirectory.open(Paths.get(argp.get("output", luceneIndexPath)))) {
        final IndexWriterConfig cfg = new IndexWriterConfig(analyzer);
        System.out.println("Similarity: " + cfg.getSimilarity());
        cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        try (IndexWriter writer = new IndexWriter(dir, cfg)) {
            iterator.forAllKeyStrings(docId -> {
                try {
                    Document document = iterator.getDocument(dcp);

                    String text = document.text;
                    String id = document.name;
                    System.out.println("Processing document: " + id);
                    org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();
                    doc.add(new StringField("id", id, Field.Store.YES));
                    // this stores the actual text with tags so formatting is preserved
                    doc.add(new StoredField("body", text));
                    org.jsoup.nodes.Document jsoup = Jsoup.parse(text);

                    // tokens of the document
                    FieldType fieldTypeText = new FieldType();
                    fieldTypeText.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
                    fieldTypeText.setStoreTermVectors(true);
                    fieldTypeText.setStoreTermVectorPositions(true);
                    fieldTypeText.setTokenized(true);
                    fieldTypeText.setStored(false);
                    fieldTypeText.freeze();
                    doc.add(new Field("tokens", jsoup.text(), fieldTypeText));

                    try {
                        writer.addDocument(doc);
                        System.out.println("Doc count: " + writer.numDocs());
                    } catch (IOException e) {
                        logger.log(Level.WARNING, "Pull-Document-Exception", e);
                        System.err.println(e.toString());
                    }

                } catch (Exception e) {
                    logger.log(Level.WARNING, "Pull-Document-Exception", e);
                    System.err.println(e.toString());
                }
            });

        }
    }

    System.out.println("Indexing Done. ");
}
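
A hedged sketch of reading this index back, assuming the layout built above (raw HTML in the stored-only "body" field, term vectors on the unstored "tokens" field); the path argument and the Lucene 8.x-era reader calls are assumptions:

import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class ReadBackExample {
    public static void main(String[] args) throws Exception {
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(args[0])))) {
            int docId = 0; // first document, purely for illustration
            // the stored-only "body" field preserves the original tagged text
            String rawHtml = reader.document(docId).get("body");
            System.out.println("body length: " + rawHtml.length());
            // "tokens" was indexed with per-document term vectors
            Terms vector = reader.getTermVector(docId, "tokens");
            TermsEnum terms = vector.iterator();
            for (BytesRef term = terms.next(); term != null; term = terms.next()) {
                System.out.println(term.utf8ToString() + " freq=" + terms.totalTermFreq());
            }
        }
    }
}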

From source file: Example.lucene.TestIndexer.java

private static void addDoc(IndexWriter w, int id, String url, String title, String content) throws IOException {
    Document doc = new Document();
    doc.add(new StoredField("id", id));
    doc.add(new TextField("url", url, Field.Store.YES));
    doc.add(new TextField("title", title, Field.Store.NO));
    // the content is analyzed for search but not stored with the document
    doc.add(new TextField("content", content, Field.Store.NO));

    w.addDocument(doc);
}
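
Because the int "id" above is stored only, it cannot be queried directly. A hedged sketch (Lucene 6+; the IntPoint companion field and the helper name are additions, not part of the original example) of making the same value both searchable and retrievable:

import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.IndexWriter;

private static void addSearchableId(IndexWriter w, int id) throws IOException {
    Document doc = new Document();
    doc.add(new IntPoint("id", id));     // indexed for exact and range queries
    doc.add(new StoredField("id", id));  // stored so the value comes back with hits
    w.addDocument(doc);
    // look-up side: searcher.search(IntPoint.newExactQuery("id", 42), 1)
}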

From source file: fr.paris.lutece.plugins.directory.service.search.DirectorySearchIndexer.java

License: Open Source License

/**
 * Builds a document which will be used by Lucene during the indexing of
 * this record
 * @param record the record to convert into a document
 * @param listContentEntry the entries in this record that are marked as
 *            is_indexed
 * @param listTitleEntry the entries in this record that are marked as
 *            is_indexed_as_title
 * @param listSummaryEntry the entries in this record that are marked as
 *            is_indexed_as_summary
 * @param plugin the plugin object
 * @return a lucene document filled with the record data
 */
public Document getDocument(Record record, List<IEntry> listContentEntry, List<IEntry> listTitleEntry,
        List<IEntry> listSummaryEntry, Plugin plugin) {
    Document doc = new Document();

    FieldType ft = new FieldType(StringField.TYPE_STORED);
    ft.setOmitNorms(false);

    FieldType ftNotStored = new FieldType(StringField.TYPE_NOT_STORED);
    ftNotStored.setOmitNorms(false);
    ftNotStored.setTokenized(false);

    boolean bFallback = false;

    //Fallback if there is no entry marked as indexed_as_title
    //Uses the first indexed field instead
    if (listTitleEntry.isEmpty() && !listContentEntry.isEmpty()) {
        listTitleEntry.add(listContentEntry.get(0));
        bFallback = true;
    }

    String strTitle = getContentToIndex(record, listTitleEntry, plugin);

    //Fallback if fields were empty
    //Uses the first indexed field instead
    if (StringUtils.isBlank(strTitle) && !bFallback && !listContentEntry.isEmpty()) {
        listTitleEntry.clear();
        listTitleEntry.add(listContentEntry.get(0));
        strTitle = getContentToIndex(record, listTitleEntry, plugin);
    }

    //No more fallback. Giving up
    if (StringUtils.isBlank(strTitle)) {
        return null;
    }

    doc.add(new Field(SearchItem.FIELD_TITLE, strTitle, ft));

    if (!listContentEntry.isEmpty()) {
        String strContent = getContentToIndex(record, listContentEntry, plugin);

        if (StringUtils.isNotBlank(strContent)) {
            doc.add(new Field(SearchItem.FIELD_CONTENTS, strContent, TextField.TYPE_NOT_STORED));
        }
    }

    if (!listSummaryEntry.isEmpty()) {
        String strSummary = getContentToIndex(record, listSummaryEntry, plugin);

        if (StringUtils.isNotBlank(strSummary)) {
            doc.add(new StoredField(SearchItem.FIELD_SUMMARY, strSummary));
        }
    }

    String strRoleKey = record.getRoleKey();

    if (StringUtils.isBlank(strRoleKey)) {
        strRoleKey = ROLE_NONE;
    }

    doc.add(new Field(SearchItem.FIELD_ROLE, strRoleKey, ft));

    String strDate = DateTools.dateToString(record.getDateCreation(), DateTools.Resolution.DAY);
    doc.add(new Field(SearchItem.FIELD_DATE, strDate, ft));

    String strDateModification = DateTools.dateToString(record.getDateModification(), DateTools.Resolution.DAY);
    doc.add(new Field(SearchItem.FIELD_DATE, strDateModification, ft));

    doc.add(new Field(SearchItem.FIELD_TYPE, DIRECTORY, ft));

    UrlItem url = new UrlItem(AppPathService.getPortalUrl());
    url.addParameter(XPageAppService.PARAM_XPAGE_APP, DIRECTORY);
    url.addParameter(PARAMETER_ID_DIRECTORY_RECORD, record.getIdRecord());
    url.addParameter(PARAMETER_VIEW_DIRECTORY_RECORD, "");
    doc.add(new Field(SearchItem.FIELD_URL, url.getUrl(), ft));

    // Add the uid as a field, so that the index can be incrementally maintained.
    // This field is not stored with the record; it is indexed, but it is not
    // tokenized prior to indexing.
    String strUID = Integer.toString(record.getIdRecord()) + "_" + SHORT_NAME;
    doc.add(new Field(SearchItem.FIELD_UID, strUID, ftNotStored));

    return doc;
}

From source file: fr.paris.lutece.plugins.document.service.search.DocumentIndexer.java

License: Open Source License

/**
 * Builds a document which will be used by Lucene during the indexing of the
 * pages of the site with the following
 * fields: summary, uid, url, contents, title and description.
 *
 * @param document the document to index
 * @param strUrl the url of the documents
 * @param strRole the lutece role of the page associate to the document
 * @param strPortletDocumentId the document id concatenated to the portlet id
 *            with a & in the middle
 * @return the built Document
 * @throws IOException The IO Exception
 * @throws InterruptedException The InterruptedException
 */
public static org.apache.lucene.document.Document getDocument(Document document, String strUrl, String strRole,
        String strPortletDocumentId) throws IOException, InterruptedException {
    // make a new, empty document
    org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();

    FieldType ft = new FieldType(StringField.TYPE_STORED);
    ft.setOmitNorms(false);

    // Add the url as a field named "url". With this FieldType the url is
    // stored with the document and indexed as a single, untokenized term.
    doc.add(new Field(SearchItem.FIELD_URL, strUrl, ft));

    // Add the PortletDocumentId as a field named "document_portlet_id".  
    doc.add(new Field(SearchItem.FIELD_DOCUMENT_PORTLET_ID, strPortletDocumentId, ft));

    // Add the last modified date of the file as a date field.
    // Use a field that is indexed (i.e. searchable), but don't tokenize
    // the field into words.
    String strDate = DateTools.dateToString(document.getDateModification(), DateTools.Resolution.DAY);
    doc.add(new Field(SearchItem.FIELD_DATE, strDate, ft));

    // Add the uid as a field, so that the index can be incrementally maintained.
    // This field is not stored with the document; it is indexed, but it is not
    // tokenized prior to indexing.
    String strIdDocument = String.valueOf(document.getId());
    doc.add(new Field(SearchItem.FIELD_UID, strIdDocument + "_" + DocumentIndexer.SHORT_NAME, ft));

    String strContentToIndex = getContentToIndex(document);
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();

    try {
        new HtmlParser().parse(new ByteArrayInputStream(strContentToIndex.getBytes()), handler, metadata,
                new ParseContext());
    } catch (SAXException e) {
        throw new AppException("Error during document parsing.");
    } catch (TikaException e) {
        throw new AppException("Error during document parsing.");
    }

    //the content of the article is recovered from the parser because it
    //has replaced encoded characters (such as &eacute;) with the corresponding special character (such as é)
    String strContent = handler.toString();

    // Add the tag-stripped contents as a Reader-valued Text field so it will
    // get tokenized and indexed.
    doc.add(new Field(SearchItem.FIELD_CONTENTS, strContent, TextField.TYPE_NOT_STORED));

    // Add the title as a separate Text field, so that it can be searched
    // separately.
    FieldType ft2 = new FieldType(TextField.TYPE_STORED);
    ft2.setOmitNorms(true);
    doc.add(new Field(SearchItem.FIELD_TITLE, document.getTitle(), ft2));

    doc.add(new Field(SearchItem.FIELD_TYPE, document.getType(), ft));

    doc.add(new Field(SearchItem.FIELD_ROLE, strRole, ft));

    // add metadata (mapped to summary)
    doc.add(new Field(SearchItem.FIELD_METADATA, document.getSummary(), TextField.TYPE_NOT_STORED));
    doc.add(new StoredField(SearchItem.FIELD_SUMMARY, document.getSummary()));

    // return the document
    return doc;
}

From source file: imgProc.SiftDocumentBuilder.java

License: Open Source License

public Field[] createDescriptorFields(BufferedImage image) {
    Field[] result = null;
    try {
        // extract features from image:
        List<Feature> features = extractor.computeSiftFeatures(image);
        result = new Field[features.size()];
        int count = 0;
        // create new document:
        for (Iterator<Feature> fit = features.iterator(); fit.hasNext();) {
            Feature f = fit.next();
            result[count] = new StoredField(DocumentBuilder.FIELD_NAME_SIFT, f.getStringRepresentation());
            System.out.println("Scale:" + f.scale);

            count++;

        }
    } catch (IOException e) {
        // feature extraction failed; result stays null
        //logger.severe(e.getMessage());
    }
    return result;
}

From source file: imgProc.SiftDocumentBuilder.java

License: Open Source License

public Document createDocument(BufferedImage image, String identifier) {
    Document doc = null;
    try {
        // extract features from image:
        List<Feature> features = extractor.computeSiftFeatures(image);
        // create new document:
        doc = new Document();
        for (Iterator<Feature> fit = features.iterator(); fit.hasNext();) {
            Feature f = fit.next();
            // add each feature to the document:
            doc.add(new StoredField(DocumentBuilder.FIELD_NAME_SIFT, f.getByteArrayRepresentation()));
        }
        if (identifier != null)
            doc.add(new StringField(DocumentBuilder.FIELD_NAME_IDENTIFIER, identifier, Field.Store.YES));
    } catch (IOException e) {
        // feature extraction failed; the document built so far (possibly null) is returned
        // logger.severe(e.getMessage());
    }
    return doc;
}
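
A hedged sketch of reading those binary features back from a retrieved hit; the Document is assumed to come from an IndexSearcher, and fieldName would be the DocumentBuilder.FIELD_NAME_SIFT constant used above:

import org.apache.lucene.document.Document;
import org.apache.lucene.util.BytesRef;

static void printFeatures(Document hit, String fieldName) {
    // every byte[] StoredField added under the same name comes back as one BytesRef
    for (BytesRef bytes : hit.getBinaryValues(fieldName)) {
        // the feature bytes live in bytes.bytes[bytes.offset .. bytes.offset + bytes.length)
        System.out.println("feature of " + bytes.length + " bytes");
    }
}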

From source file: index.IndexDirectoryBuilder.java

License: Apache License

/**
 * Builds a Lucene document to be added to the index based on a
 * specified name for the location and the corresponding
 * {@link GeoName} object.
 * 
 * @param name          name to serve as index key
 * @param geonameEntry  string from GeoNames gazetteer
 * @param geonameID     unique identifier (for quick look-up)
 * @param population    number of inhabitants (used for scoring)
 * @return              document to be added to the index
 */
public static Document buildDoc(String name, String geonameEntry, int geonameID, Long population) {
    // in case you're wondering, yes, this is a non-standard use of
    // the Lucene Document construct :)
    Document doc = new Document();

    // this is essentially the key we'll try to match location
    // names against
    doc.add(new TextField("indexName", name, Field.Store.YES));

    // this is the payload we'll return when matching location
    // names to gazetteer records
    doc.add(new StoredField("geoname", geonameEntry));

    // TODO: use geonameID to link administrative subdivisions to
    //       each other
    doc.add(new IntField("geonameID", geonameID, Field.Store.YES));

    // we'll initially sort match results based on population
    doc.add(new LongField("population", population, Field.Store.YES));

    logger.debug("Adding to index: " + name);

    return doc;
}
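
A hedged note: IntField and LongField belong to Lucene's legacy numeric API and were removed in Lucene 6. On current versions, a rough equivalent of the two numeric lines above (an assumption about intent, not part of the original source) would combine point, doc-values, and stored fields:

import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StoredField;

// inside buildDoc, replacing the IntField and LongField lines:
doc.add(new IntPoint("geonameID", geonameID));                // indexed for look-up
doc.add(new StoredField("geonameID", geonameID));             // retrievable with hits
doc.add(new NumericDocValuesField("population", population)); // sortable for ranking
doc.add(new StoredField("population", population));           // retrievable with hits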