Example usage for the org.apache.lucene.document Field constructor

List of usage examples for the org.apache.lucene.document Field constructor

Introduction

On this page you can find example usages of the org.apache.lucene.document Field constructor.

Prototype

public Field(String name, byte[] value, int offset, int length, IndexableFieldType type) 

Source Link

Document

Create field with binary value.

Usage

From source file:aos.lucene.tools.FastVectorHighlighterSample.java

License:Apache License

/**
 * Builds the sample index: one document per entry in DOCS, analyzed into
 * field F with term vectors carrying positions and offsets (the vector
 * data the fast-vector highlighter reads back later).
 *
 * @throws IOException if the index cannot be written
 */
static void makeIndex() throws IOException {
    // Legacy IndexWriter ctor: create=true wipes any existing index in dir.
    IndexWriter writer = new IndexWriter(dir, analyzer, true, MaxFieldLength.UNLIMITED);
    for (String d : DOCS) {
        Document doc = new Document();
        // Positions + offsets must be indexed for highlighting to work.
        doc.add(new Field(F, d, Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
        writer.addDocument(doc);
    }
    writer.close();
}

From source file:au.edu.unimelb.csse.analyser.NodeCacheTest.java

License:Apache License

/**
 * Verifies that NodeCache recycles parse nodes: after each sentence is
 * parsed the cache holds exactly the surplus nodes returned by the
 * previous sentence, and indexing the JSON forms afterwards leaves the
 * cache untouched.
 */
public void testReusesNodesWhileIndexing() throws Exception {
    String[] sents = new String[] { "(A(B C)(D(E F)))", "(A(B(C D)))", "(A(B C)(D(E(F(G H)))))", "(A(B C))" };
    String[] jsonSents = new String[sents.length];
    String2NodesParser parser = new String2NodesParser();
    assertEquals(0, NodeCache.cacheSize());
    int[] expectedCounts = new int[] { 0, 2, 0, 5 };
    // First sent: 6 nodes are used but not returned until the next sentence
    //   is read, so the cache still reports size 0.
    // Second sent: 6 nodes come back, the new sentence takes 4 -> 6 - 4 = 2.
    // Third sent: 4 nodes come back, the new sentence takes 8 -> size 0 again.
    // Fourth sent: 8 nodes come back, the new sentence takes 3 -> 8 - 3 = 5.
    for (int i = 0; i < sents.length; i++) {
        jsonSents[i] = parser.parse(sents[i]).asJSONString();
        assertEquals(expectedCounts[i], NodeCache.cacheSize());
    }
    Analyzer analyser = new NodeTreebankAnalyser(false);
    RAMDirectory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, analyser, true, IndexWriter.MaxFieldLength.UNLIMITED);

    // The original repeated this index-and-assert sequence verbatim four
    // times; a loop removes the duplication without changing the order of
    // operations. Indexing must never disturb the node cache.
    for (String json : jsonSents) {
        Document d = new Document();
        d.add(new Field("sent", json, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS,
                Field.TermVector.WITH_POSITIONS));
        writer.addDocument(d);
        assertEquals(5, NodeCache.cacheSize());
    }
    // Release the writer (the original left it open).
    writer.close();
}

From source file:au.edu.unimelb.csse.CreateIndex.java

License:Apache License

/**
 * Parses every sentence in a treebank file and indexes its JSON tree form.
 * Files whose extension does not match the configured input format
 * (".gz" when gzip is set, ".mrg" otherwise) are skipped entirely.
 * Sentences that fail to parse, or overflow the index encoding, are
 * skipped individually; indexing stops early once numSents sentences have
 * been processed (maxReached is set in that case).
 *
 * @param file the treebank file to index
 * @throws IOException if the file cannot be read
 */
void processTreebankFile(File file) throws IOException {
    if (gzip) {
        if (!file.getName().endsWith(".gz"))
            return;
    } else {
        if (!file.getName().endsWith(".mrg"))
            return;
    }
    BufferedReader reader = new BufferedReader(getInputStreamReader(file));
    // try/finally guarantees the reader is closed even when an unexpected
    // exception escapes the loop — the original leaked it in that case.
    try {
        System.out.println(
                "[" + DateFormat.getInstance().format(new Date()) + "] Indexing file " + file.getName());

        // The tokenizer is reused across files to avoid reallocation.
        if (tokenizer == null) {
            tokenizer = new SentenceTokenizer(reader);
        } else {
            tokenizer.reset(reader);
        }
        SentenceAndMetaData next = tokenizer.next();
        while (next != null) {
            String sentence = next.sentence();
            Node root;
            try {
                root = parser.parse(sentence);
            } catch (ParseException e1) {
                // Unparseable sentence: report it and move on to the next one.
                e1.printStackTrace();
                next = tokenizer.next();
                continue;
            }
            String asJson = root.asJSONString();
            Document d = new Document();
            d.add(new Field("sent", asJson, Field.Store.COMPRESS, Field.Index.ANALYZED_NO_NORMS,
                    Field.TermVector.WITH_POSITIONS));
            // "docnum" records the origin as <filename>.<line offset>.
            d.add(new Field("docnum", file.getName() + "." + next.lineOffset(), Field.Store.YES, Field.Index.NO,
                    Field.TermVector.NO));
            try {
                writer.addDocument(d);
                sentencesProcessed++;
            } catch (OverflowException e) {
                // Sentence too large for the index encoding: skip it.
                next = tokenizer.next();
                continue;
            } catch (Exception e) {
                System.err.println(e.getMessage());
                logger.warning(e.getMessage());
            }
            if (sentencesProcessed == numSents) {
                maxReached = true;
                break;
            }
            next = tokenizer.next();
        }
    } finally {
        reader.close();
    }
}

From source file:au.edu.unimelb.csse.CreateTextIndex.java

License:Apache License

/**
 * Indexes every raw sentence from a treebank file into the "sent" field.
 * Files whose extension does not match the configured input format
 * (".gz" when isGzip is set, ".mrg" otherwise) are skipped. Progress is
 * printed every 500,000 sentences; indexing stops once numSents sentences
 * have been processed (maxReached is set in that case).
 *
 * @param file the treebank file to index
 * @throws IOException if the file cannot be read
 */
private void processTreebankFile(File file) throws IOException {
    if (isGzip) {
        if (!file.getName().endsWith(".gz"))
            return;
    } else {
        if (!file.getName().endsWith(".mrg"))
            return;
    }
    BufferedReader reader = new BufferedReader(getInputStreamReader(file));
    // try/finally guarantees the reader is closed even when an unexpected
    // exception escapes the loop — the original leaked it in that case.
    try {
        System.out.println("Processing treebank file: " + file.getName());
        // The tokenizer is reused across files to avoid reallocation.
        if (tokenizer == null) {
            tokenizer = new SentenceTokenizer(reader);
        } else {
            tokenizer.reset(reader);
        }
        SentenceAndMetaData next = tokenizer.next();
        while (next != null) {
            String sentence = next.sentence();
            Document d = new Document();
            d.add(new Field("sent", sentence, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS,
                    Field.TermVector.WITH_POSITIONS));
            try {
                writer.addDocument(d);
                sentencesProcessed++;
                if (sentencesProcessed % 500000 == 0) {
                    System.out.println("Finished indexing " + sentencesProcessed + " sentences.");
                    System.out.println(
                            "Time from start: " + (System.currentTimeMillis() - startTime) / 1000.0
                                    + " seconds.");
                }
            } catch (OverflowException e) {
                // Sentence too large for the index encoding: skip it.
                next = tokenizer.next();
                continue;
            } catch (Exception e) {
                System.err.println(e.getMessage());
            }
            if (sentencesProcessed == numSents) {
                maxReached = true;
                break;
            }
            next = tokenizer.next();
        }
    } finally {
        reader.close();
    }
}

From source file:au.edu.unimelb.csse.join.JoinFunctionalTest.java

License:Apache License

/**
 * Checks the number of comparisons JoinLogic performs for various join
 * strategies over one indexed sentence. Match counting has to be enabled
 * in JoinLogic for this test to run.
 */
public void testNumberOfCallsToMatch() throws Exception {
    String sentence = "(NP(NP(DT The)(NN year))(NP(NP(CD 1956))(PP(IN in)(NP(JJ rugby)(NN union))))(. .))";
    Analyzer analyzer = new FastStringAnalyser();
    RAMDirectory directory = new RAMDirectory();
    IndexWriter indexWriter = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);

    Document document = new Document();
    document.add(new Field("sent", sentence, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS,
            Field.TermVector.WITH_POSITIONS));
    indexWriter.addDocument(document);
    indexWriter.close();

    IndexSearcher searcher = new IndexSearcher(directory);

    // { query, join type, lookahead, expected comparison count }
    Object[][] expectations = {
            { "//NP//NP", TermJoinType.SIMPLE, false, 6 },
            { "//NP//NP", TermJoinType.SIMPLE_WITH_FC, false, 1 },
            { "//NP//NP", TermJoinType.EARLY_STOP, false, 2 },
            { "//NP//NP", TermJoinType.EARLY_STOP_WITH_FC, false, 1 },
            { "//NP//NP", TermJoinType.SIMPLE, true, 6 },
            { "//NP//NP", TermJoinType.SIMPLE_WITH_FC, true, 5 },
            { "//NP//NP", TermJoinType.EARLY_STOP, true, 6 },
            { "//NP//NP", TermJoinType.EARLY_STOP_WITH_FC, true, 5 },
            { "//NP//NP//NP", TermJoinType.SIMPLE, false, 23 },
            { "//NP//NP//NP", TermJoinType.SIMPLE_WITH_FC, false, 10 },
            { "//NP//NP//NP", TermJoinType.EARLY_STOP, false, 10 },
            { "//NP//NP//NP", TermJoinType.EARLY_STOP_WITH_FC, false, 8 },
    };
    for (Object[] row : expectations) {
        assertNumberOfComparisons(searcher, (String) row[0], (TermJoinType) row[1], (Boolean) row[2],
                (Integer) row[3]);
    }
}

From source file:au.edu.unimelb.csse.join.JoinFunctionalTest.java

License:Apache License

/**
 * Indexes one sentence and checks that the filter-join query
 * "//PP[/IN AND /NP]" yields exactly one hit under every combination of
 * join type and lookahead, and that a deeper filter expression also
 * matches exactly once.
 */
public void testFilterjoin() throws Exception {
    String sentence = "(NP(NP(DT The)(NN year))(NP(NP(CD 1956))(PP(IN in)(NP(JJ rugby)(NN union))))(. .))";
    Analyzer analyzer = new FastStringAnalyser();
    RAMDirectory directory = new RAMDirectory();
    IndexWriter indexWriter = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);

    Document document = new Document();
    document.add(new Field("sent", sentence, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS,
            Field.TermVector.WITH_POSITIONS));
    indexWriter.addDocument(document);
    indexWriter.close();

    IndexSearcher searcher = new IndexSearcher(directory);

    // Every join strategy, with and without lookahead, must find the single match.
    for (TermJoinType joinType : TermJoinType.values()) {
        for (boolean lookahead : new boolean[] { false, true }) {
            TreebankQuery treebankQuery = new QueryBuilder("//PP[/IN AND /NP]").parse(joinType, lookahead);
            SimpleHitCollector collector = new SimpleHitCollector(10);
            searcher.search(treebankQuery, collector);
            assertEquals(1, collector.totalHits);
        }
    }

    // A deeper filter expression should likewise match exactly once.
    TreebankQuery treebankQuery = new QueryBuilder("//PP[/IN AND /NP/JJ/rugby]").parse(TermJoinType.SIMPLE, true);
    SimpleHitCollector collector = new SimpleHitCollector(10);
    searcher.search(treebankQuery, collector);
    assertEquals(1, collector.totalHits);
}

From source file:ccc.plugins.search.lucene.SimpleLuceneFS.java

License:Open Source License

/**
 * Indexes one resource as a Lucene document, first removing any previous
 * document with the same id. Adds: each paragraph (via indexParagraph),
 * the serialised ACL as a stored binary field, the analyzed content under
 * DEFAULT_FIELD, exact-match fields for id, path and name (path and name
 * lower-cased with the configured locale), the resource type, and the
 * title/tags/date fields via the add* helpers.
 *
 * IOExceptions are logged and swallowed so an indexing failure does not
 * propagate to the caller.
 */
private void createDocument(final UUID id, final ResourcePath path, final ResourceName name, final String title,
        final Set<String> tags, final String content, final Set<Paragraph> paragraphs, final ResourceType type,
        final Date dateCreated, final Date dateChanged, final Collection<ACL> acl) {
    try {
        // Remove any stale copy of this resource from the index first.
        clearDocuments(id);

        final Document d = new Document();

        if (paragraphs != null) {
            for (final Paragraph paragraph : paragraphs) {
                indexParagraph(d, paragraph);
            }
        }

        // ACL is serialised by AclFilter and stored as a binary field
        // (presumably read back by AclFilter at query time — confirm).
        final byte[] s11nAcl = AclFilter.serialise(acl);
        d.add(new Field(ACL_FIELD, s11nAcl, 0, s11nAcl.length, Field.Store.YES));
        d.add(new Field(DEFAULT_FIELD, content, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES));
        d.add(new Field("id", id.toString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        // Path and name are lower-cased for case-insensitive exact lookup.
        d.add(new Field("path", "/content" + path.toString().toLowerCase(_locale), Field.Store.NO,
                Field.Index.NOT_ANALYZED));
        d.add(new Field("name", name.toString().toLowerCase(_locale), Field.Store.NO,
                Field.Index.NOT_ANALYZED));
        addEnum(d, "type", type);

        addStringField(d, "title", title);
        addTagsField(d, "tags", tags);
        addDateField(d, "date_created", dateCreated);
        addDateField(d, "date_changed", dateChanged);

        _writer.addDocument(d);
        LOG.debug("Added document.");

    } catch (final IOException e) {
        // Best-effort indexing: log and continue rather than fail the caller.
        LOG.warn("Error adding document.", e);
    }
}

From source file:cn.edu.thss.iise.beehivez.server.index.labelindex.LabelDocument.java

License:Open Source License

/**
 * Wraps a label string in a single-field Lucene document. The label is
 * both stored and analyzed (without norms), with term vectors enabled,
 * while term frequencies and positions are omitted for the field.
 */
public static Document Document(String label) {
    Document result = new Document();
    Field labelField = new Field(FIELD_LABEL, label, Field.Store.YES, Field.Index.ANALYZED_NO_NORMS,
            Field.TermVector.YES);
    // Frequency/position data is not needed for this field.
    labelField.setOmitTermFreqAndPositions(true);
    result.add(labelField);
    return result;
}

From source file:com.appspot.socialinquirer.server.service.impl.AnalysisServiceImpl.java

License:Apache License

/**
 * Extracts term-frequency tags from a title and an HTML body by indexing
 * both into a throwaway in-memory index and reading back the term vectors.
 *
 * @param title the document title (analyzed as-is)
 * @param text the body text; HTML tags are stripped before analysis
 * @return tags sorted by descending frequency (empty on failure — errors
 *         are logged, never thrown)
 */
@Override
public List<Tag> getTermVector(String title, String text) {
    RAMDirectory directory = null;
    IndexReader reader = null;
    Map<String, Tag> tagsMap = new HashMap<String, Tag>();

    try {
        directory = new RAMDirectory();

        IndexWriter writer = new IndexWriter(directory, new StandardAnalyzer(Version.LUCENE_CURRENT), true,
                MaxFieldLength.UNLIMITED);
        try {
            Document doc = new Document();

            doc.add(new Field("title", title, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
            doc.add(new Field("body", stripHtmlTags(text, true), Field.Store.YES, Field.Index.ANALYZED,
                    Field.TermVector.YES));
            writer.addDocument(doc);
        } finally {
            // Close in finally: the original leaked the writer when
            // addDocument threw before writer.close() was reached.
            writer.close();
        }
        reader = IndexReader.open(directory, true);
        int numDocs = reader.maxDoc();
        for (int i = 0; i < numDocs; i++) {
            TermFreqVector termFreqVector = reader.getTermFreqVector(i, "title");
            pullTags(termFreqVector, tagsMap);
            termFreqVector = reader.getTermFreqVector(i, "body");
            pullTags(termFreqVector, tagsMap);
        }

    } catch (Exception e) {
        logger.log(Level.SEVERE, "An error occured while pulling tags from text.", e);
    } finally {
        closeIndexReader(reader);
        closeRAMDirectory(directory);
    }
    ArrayList<Tag> tagsList = new ArrayList<Tag>(tagsMap.values());
    // Sort by descending frequency; Integer.compare avoids the overflow
    // hazard of subtraction-based comparators.
    Collections.sort(tagsList, new Comparator<Tag>() {
        @Override
        public int compare(Tag o1, Tag o2) {
            return Integer.compare(o2.getFreqency(), o1.getFreqency());
        }
    });

    return tagsList;
}

From source file:com.bluecubs.xinco.index.XincoDocument.java

License:Apache License

/**
 * Builds a Lucene document for a XincoCoreData item: core metadata
 * (id, designation, language id), optionally the file content (when
 * index_content is set, the data is a file, and it is not archived), and
 * one field per additional attribute, named after the attribute's
 * designation and typed per its declared data type.
 *
 * NOTE(review): this uses a legacy Lucene API (Field.Text factories and a
 * boolean-flag Field constructor) — presumably Lucene 1.x-era; confirm
 * against the project's Lucene version before modernising.
 *
 * @param d the data item to index
 * @param index_content whether to index the underlying file's content
 * @param dbm provides indexer configuration and the file repository path
 * @throws java.io.FileNotFoundException if the repository file is missing
 */
public static Document getXincoDocument(XincoCoreData d, boolean index_content, XincoDBManager dbm)
        throws java.io.FileNotFoundException {

    // NOTE(review): j, i2, j2, k, k2, is and temp_doc appear unused here.
    int i, j, l;
    int i2, j2;
    short k, k2;
    FileInputStream is = null;
    Document doc = null;
    Document temp_doc = null;
    int file_type = 0;
    int file_ext_index = 0;
    String file_ext = "";

    doc = new Document();

    //add XincoCoreData information
    // Legacy boolean-flag Field ctor — flags presumably mean
    // (store, index, tokenize); TODO confirm for this Lucene version.
    doc.add(new Field("id", (new Integer(d.getId())).toString(), true, true, false));
    doc.add(Field.Text("designation", d.getDesignation()));
    doc.add(new Field("language", (new Integer(d.getXinco_core_language().getId())).toString(), true, true,
            false));

    //add content of file
    if (index_content) {
        // Data type 1 with status != 3: presumably "file" and "not archived"
        // per the original comment — only those get content-indexed.
        if ((d.getXinco_core_data_type().getId() == 1) && (d.getStatus_number() != 3)) { //process non-archived file
            //extract file extension from file name
            // Attribute 0's varchar holds the file name; take the text after
            // the last '.' (empty when there is no dot or the dot is last).
            file_ext_index = ((XincoAddAttribute) d.getXinco_add_attributes().elementAt(0)).getAttrib_varchar()
                    .lastIndexOf(".");
            if (file_ext_index == -1) {
                file_ext = "";
            } else {
                if (file_ext_index >= ((XincoAddAttribute) d.getXinco_add_attributes().elementAt(0))
                        .getAttrib_varchar().length() - 1) {
                    file_ext = "";
                } else {
                    file_ext = ((XincoAddAttribute) d.getXinco_add_attributes().elementAt(0))
                            .getAttrib_varchar().substring(file_ext_index + 1);
                }
            }
            //check which indexer to use for file extension
            // file_type: 0 = plain text (default), >0 = 1-based index into the
            // configured indexer list, -1 = do not index.
            file_type = 0; // default: index as TEXT
            for (l = 0; l < dbm.config.FileIndexerCount; l++) {
                for (i = 0; i < ((String[]) dbm.config.IndexFileTypesExt.elementAt(l)).length; i++) {
                    if (((String[]) dbm.config.IndexFileTypesExt.elementAt(l))[i].compareTo(file_ext) == 0) {
                        file_type = l + 1; // file-type specific indexing
                        break;
                    }
                }
                if (file_type > 0) {
                    break;
                }
            }
            if (file_type == 0) {
                // Extension not claimed by any indexer: check the no-index list.
                for (i = 0; i < dbm.config.IndexNoIndex.length; i++) {
                    if (dbm.config.IndexNoIndex[i].compareTo(file_ext) == 0) {
                        file_type = -1; // NO indexing
                        break;
                    }
                }
            }
            // call actual indexing classes
            XincoIndexFileType xift = null;
            Reader ContentReader = null;
            String ContentString = null;
            if (file_type == 0) {
                // index as TEXT
                xift = new XincoIndexText();
                doc.add(Field.Text("file", xift.getFileContentReader(
                        new File(dbm.config.FileRepositoryPath + (new Integer(d.getId())).toString()))));
            } else if (file_type > 0) {
                // file-type specific indexing: instantiate the configured
                // indexer class reflectively, preferring its Reader over the
                // String fallback.
                try {
                    xift = (XincoIndexFileType) Class
                            .forName((String) dbm.config.IndexFileTypesClass.elementAt(file_type - 1))
                            .newInstance();
                    ContentReader = xift.getFileContentReader(
                            new File(dbm.config.FileRepositoryPath + (new Integer(d.getId())).toString()));
                    if (ContentReader != null) {
                        doc.add(Field.Text("file", ContentReader));
                    } else {
                        ContentString = xift.getFileContentString(
                                new File(dbm.config.FileRepositoryPath + (new Integer(d.getId())).toString()));
                        if (ContentString != null) {
                            doc.add(Field.Text("file", ContentString));
                        }
                    }
                } catch (Exception ie) {
                    // NOTE(review): indexer failures are swallowed silently —
                    // the document is still returned without file content.
                }
            }

        }
    }

    //add attributes
    // One field per additional attribute: field name is the attribute's
    // designation, value taken from the accessor matching its data type.
    for (i = 0; i < d.getXinco_add_attributes().size(); i++) {
        if (((XincoCoreDataTypeAttribute) d.getXinco_core_data_type().getXinco_core_data_type_attributes()
                .elementAt(i)).getData_type().toLowerCase().compareTo("int") == 0) {
            doc.add(Field.Text(
                    ((XincoCoreDataTypeAttribute) d.getXinco_core_data_type()
                            .getXinco_core_data_type_attributes().elementAt(i)).getDesignation(),
                    "" + ((XincoAddAttribute) d.getXinco_add_attributes().elementAt(i)).getAttrib_int()));
        }
        if (((XincoCoreDataTypeAttribute) d.getXinco_core_data_type().getXinco_core_data_type_attributes()
                .elementAt(i)).getData_type().toLowerCase().compareTo("unsignedint") == 0) {
            doc.add(Field.Text(
                    ((XincoCoreDataTypeAttribute) d.getXinco_core_data_type()
                            .getXinco_core_data_type_attributes().elementAt(i)).getDesignation(),
                    "" + ((XincoAddAttribute) d.getXinco_add_attributes().elementAt(i))
                            .getAttrib_unsignedint()));
        }
        if (((XincoCoreDataTypeAttribute) d.getXinco_core_data_type().getXinco_core_data_type_attributes()
                .elementAt(i)).getData_type().toLowerCase().compareTo("double") == 0) {
            doc.add(Field.Text(
                    ((XincoCoreDataTypeAttribute) d.getXinco_core_data_type()
                            .getXinco_core_data_type_attributes().elementAt(i)).getDesignation(),
                    "" + ((XincoAddAttribute) d.getXinco_add_attributes().elementAt(i)).getAttrib_double()));
        }
        if (((XincoCoreDataTypeAttribute) d.getXinco_core_data_type().getXinco_core_data_type_attributes()
                .elementAt(i)).getData_type().toLowerCase().compareTo("varchar") == 0) {
            doc.add(Field.Text(
                    ((XincoCoreDataTypeAttribute) d.getXinco_core_data_type()
                            .getXinco_core_data_type_attributes().elementAt(i)).getDesignation(),
                    ((XincoAddAttribute) d.getXinco_add_attributes().elementAt(i)).getAttrib_varchar()));
        }
        if (((XincoCoreDataTypeAttribute) d.getXinco_core_data_type().getXinco_core_data_type_attributes()
                .elementAt(i)).getData_type().toLowerCase().compareTo("text") == 0) {
            doc.add(Field.Text(
                    ((XincoCoreDataTypeAttribute) d.getXinco_core_data_type()
                            .getXinco_core_data_type_attributes().elementAt(i)).getDesignation(),
                    ((XincoAddAttribute) d.getXinco_add_attributes().elementAt(i)).getAttrib_text()));
        }
        if (((XincoCoreDataTypeAttribute) d.getXinco_core_data_type().getXinco_core_data_type_attributes()
                .elementAt(i)).getData_type().toLowerCase().compareTo("datetime") == 0) {
            doc.add(Field.Text(
                    ((XincoCoreDataTypeAttribute) d.getXinco_core_data_type()
                            .getXinco_core_data_type_attributes().elementAt(i)).getDesignation(),
                    "" + ((XincoAddAttribute) d.getXinco_add_attributes().elementAt(i)).getAttrib_datetime()));
        }
    }

    return doc;
}