List of usage examples for the org.apache.lucene.document.Field class (byte-array constructor shown below):
public Field(String name, byte[] value, int offset, int length, IndexableFieldType type)
From source file:aos.lucene.tools.FastVectorHighlighterSample.java
License:Apache License
/**
 * Builds the sample index over the DOCS strings, storing term vectors
 * with positions and offsets (required by FastVectorHighlighter).
 *
 * @throws IOException if the index cannot be written
 */
static void makeIndex() throws IOException {
    IndexWriter indexWriter = new IndexWriter(dir, analyzer, true, MaxFieldLength.UNLIMITED);
    for (String text : DOCS) {
        Document document = new Document();
        // WITH_POSITIONS_OFFSETS: the highlighter needs both positions and offsets.
        document.add(new Field(F, text, Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
        indexWriter.addDocument(document);
    }
    indexWriter.close();
}
From source file:au.edu.unimelb.csse.analyser.NodeCacheTest.java
License:Apache License
public void testReusesNodesWhileIndexing() throws Exception { String[] sents = new String[] { "(A(B C)(D(E F)))", "(A(B(C D)))", "(A(B C)(D(E(F(G H)))))", "(A(B C))" }; String[] jsonSents = new String[sents.length]; String2NodesParser parser = new String2NodesParser(); assertEquals(0, NodeCache.cacheSize()); int[] expectedCounts = new int[] { 0, 2, 0, 5 }; //First sent: 6 nodes are used but they are not returned until the next sentence is read. //Hence the cache still returns a size of 0 //Second sent: 6 nodes are returned back but the new sentence contains 4 nodes //6 - 4 = 2/*from w w w .ja v a 2s. co m*/ //Third sent: 4 nodes are returned back but the new sentence contains 8 nodes //size shows 0 again //Fourth sent: 8 nodes are returned back but the new sentence contains 3 nodes //8 - 3 = 5 for (int i = 0; i < sents.length; i++) { jsonSents[i] = parser.parse(sents[i]).asJSONString(); assertEquals(expectedCounts[i], NodeCache.cacheSize()); } Analyzer analyser = new NodeTreebankAnalyser(false); RAMDirectory dir = new RAMDirectory(); IndexWriter writer = new IndexWriter(dir, analyser, true, IndexWriter.MaxFieldLength.UNLIMITED); Document d = new Document(); d.add(new Field("sent", jsonSents[0], Field.Store.NO, Field.Index.ANALYZED_NO_NORMS, Field.TermVector.WITH_POSITIONS)); writer.addDocument(d); //No change to Node cache assertEquals(5, NodeCache.cacheSize()); d = new Document(); d.add(new Field("sent", jsonSents[1], Field.Store.NO, Field.Index.ANALYZED_NO_NORMS, Field.TermVector.WITH_POSITIONS)); writer.addDocument(d); //No change to Node cache assertEquals(5, NodeCache.cacheSize()); d = new Document(); d.add(new Field("sent", jsonSents[2], Field.Store.NO, Field.Index.ANALYZED_NO_NORMS, Field.TermVector.WITH_POSITIONS)); writer.addDocument(d); //No change to Node cache assertEquals(5, NodeCache.cacheSize()); d = new Document(); d.add(new Field("sent", jsonSents[3], Field.Store.NO, Field.Index.ANALYZED_NO_NORMS, Field.TermVector.WITH_POSITIONS)); 
writer.addDocument(d); //No change to Node cache assertEquals(5, NodeCache.cacheSize()); }
From source file:au.edu.unimelb.csse.CreateIndex.java
License:Apache License
void processTreebankFile(File file) throws IOException { if (gzip) {// ww w. j a va2 s. com if (!file.getName().endsWith(".gz")) return; } else { if (!file.getName().endsWith(".mrg")) return; } BufferedReader reader = new BufferedReader(getInputStreamReader(file)); System.out.println("[" + DateFormat.getInstance().format(new Date()) + "] Indexing file " + file.getName()); if (tokenizer == null) { tokenizer = new SentenceTokenizer(reader); } else { tokenizer.reset(reader); } SentenceAndMetaData next = tokenizer.next(); while (next != null) { String sentence = next.sentence(); Node root; try { root = parser.parse(sentence); } catch (ParseException e1) { e1.printStackTrace(); next = tokenizer.next(); continue; } String asJson = root.asJSONString(); Document d = new Document(); d.add(new Field("sent", asJson, Field.Store.COMPRESS, Field.Index.ANALYZED_NO_NORMS, Field.TermVector.WITH_POSITIONS)); d.add(new Field("docnum", file.getName() + "." + next.lineOffset(), Field.Store.YES, Field.Index.NO, Field.TermVector.NO)); // String id = "n=" + fname + "&l=" + next.lineOffset() + "&nol" // + next.numberOfLines(); try { writer.addDocument(d); sentencesProcessed++; } catch (OverflowException e) { // System.err.println("cannot index sentence " + id); // logger.info(e.getMessage()); next = tokenizer.next(); continue; } catch (Exception e) { // System.err.println("error while indexing sentence " + id); System.err.println(e.getMessage()); logger.warning(e.getMessage()); } if (sentencesProcessed == numSents) { maxReached = true; break; } next = tokenizer.next(); } reader.close(); }
From source file:au.edu.unimelb.csse.CreateTextIndex.java
License:Apache License
private void processTreebankFile(File file) throws IOException { if (isGzip) { if (!file.getName().endsWith(".gz")) return; } else {//from ww w . j a va 2s. c o m if (!file.getName().endsWith(".mrg")) return; } BufferedReader reader = new BufferedReader(getInputStreamReader(file)); System.out.println("Processing treebank file: " + file.getName()); if (tokenizer == null) { tokenizer = new SentenceTokenizer(reader); } else { tokenizer.reset(reader); } SentenceAndMetaData next = tokenizer.next(); while (next != null) { String sentence = next.sentence(); Document d = new Document(); d.add(new Field("sent", sentence, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS, Field.TermVector.WITH_POSITIONS)); try { writer.addDocument(d); sentencesProcessed++; if (sentencesProcessed % 500000 == 0) { System.out.println("Finished indexing " + sentencesProcessed + " sentences."); System.out.println( "Time from start: " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds."); } } catch (OverflowException e) { next = tokenizer.next(); continue; } catch (Exception e) { System.err.println(e.getMessage()); } if (sentencesProcessed == numSents) { maxReached = true; break; } next = tokenizer.next(); } reader.close(); }
From source file:au.edu.unimelb.csse.join.JoinFunctionalTest.java
License:Apache License
/**
 * Checks the number of pairwise match comparisons performed by each term-join
 * strategy for //NP//NP and //NP//NP//NP queries over a single parse tree.
 * NOTE: match counting has to be enabled in JoinLogic for this test to run.
 *
 * @throws Exception on any indexing or query failure
 */
public void testNumberOfCallsToMatch() throws Exception {
    String sent = "(NP(NP(DT The)(NN year))(NP(NP(CD 1956))(PP(IN in)(NP(JJ rugby)(NN union))))(. .))";
    Analyzer analyzer = new FastStringAnalyser();
    RAMDirectory directory = new RAMDirectory();
    IndexWriter writer = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
    Document document = new Document();
    document.add(new Field("sent", sent, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS,
            Field.TermVector.WITH_POSITIONS));
    writer.addDocument(document);
    writer.close();
    IndexSearcher searcher = new IndexSearcher(directory);
    // //NP//NP, without lookahead
    assertNumberOfComparisons(searcher, "//NP//NP", TermJoinType.SIMPLE, false, 6);
    assertNumberOfComparisons(searcher, "//NP//NP", TermJoinType.SIMPLE_WITH_FC, false, 1);
    assertNumberOfComparisons(searcher, "//NP//NP", TermJoinType.EARLY_STOP, false, 2);
    assertNumberOfComparisons(searcher, "//NP//NP", TermJoinType.EARLY_STOP_WITH_FC, false, 1);
    // //NP//NP, with lookahead
    assertNumberOfComparisons(searcher, "//NP//NP", TermJoinType.SIMPLE, true, 6);
    assertNumberOfComparisons(searcher, "//NP//NP", TermJoinType.SIMPLE_WITH_FC, true, 5);
    assertNumberOfComparisons(searcher, "//NP//NP", TermJoinType.EARLY_STOP, true, 6);
    assertNumberOfComparisons(searcher, "//NP//NP", TermJoinType.EARLY_STOP_WITH_FC, true, 5);
    // //NP//NP//NP, without lookahead
    assertNumberOfComparisons(searcher, "//NP//NP//NP", TermJoinType.SIMPLE, false, 23);
    assertNumberOfComparisons(searcher, "//NP//NP//NP", TermJoinType.SIMPLE_WITH_FC, false, 10);
    assertNumberOfComparisons(searcher, "//NP//NP//NP", TermJoinType.EARLY_STOP, false, 10);
    assertNumberOfComparisons(searcher, "//NP//NP//NP", TermJoinType.EARLY_STOP_WITH_FC, false, 8);
}
From source file:au.edu.unimelb.csse.join.JoinFunctionalTest.java
License:Apache License
/**
 * Indexes a single parse tree and asserts that the filter-join query
 * //PP[/IN AND /NP] matches exactly once for every combination of
 * TermJoinType and lookahead, and likewise for a deeper path filter.
 */
public void testFilterjoin() throws Exception {
    String sent = "(NP(NP(DT The)(NN year))(NP(NP(CD 1956))(PP(IN in)(NP(JJ rugby)(NN union))))(. .))";
    Analyzer analyzer = new FastStringAnalyser();
    RAMDirectory directory = new RAMDirectory();
    IndexWriter writer = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
    Document document = new Document();
    document.add(new Field("sent", sent, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS,
            Field.TermVector.WITH_POSITIONS));
    writer.addDocument(document);
    writer.close();
    IndexSearcher searcher = new IndexSearcher(directory);
    // Every join type, with and without lookahead, must find the single PP.
    for (TermJoinType type : TermJoinType.values()) {
        for (boolean lookahead : new boolean[] { false, true }) {
            QueryBuilder builder = new QueryBuilder("//PP[/IN AND /NP]");
            TreebankQuery query = builder.parse(type, lookahead);
            SimpleHitCollector collector = new SimpleHitCollector(10);
            searcher.search(query, collector);
            assertEquals(1, collector.totalHits);
        }
    }
    // A deeper path filter still matches the same PP exactly once.
    QueryBuilder builder = new QueryBuilder("//PP[/IN AND /NP/JJ/rugby]");
    TreebankQuery query = builder.parse(TermJoinType.SIMPLE, true);
    SimpleHitCollector collector = new SimpleHitCollector(10);
    searcher.search(query, collector);
    assertEquals(1, collector.totalHits);
}
From source file:ccc.plugins.search.lucene.SimpleLuceneFS.java
License:Open Source License
private void createDocument(final UUID id, final ResourcePath path, final ResourceName name, final String title, final Set<String> tags, final String content, final Set<Paragraph> paragraphs, final ResourceType type, final Date dateCreated, final Date dateChanged, final Collection<ACL> acl) { try {//from w w w. jav a 2 s . c om clearDocuments(id); final Document d = new Document(); if (paragraphs != null) { for (final Paragraph paragraph : paragraphs) { indexParagraph(d, paragraph); } } final byte[] s11nAcl = AclFilter.serialise(acl); d.add(new Field(ACL_FIELD, s11nAcl, 0, s11nAcl.length, Field.Store.YES)); d.add(new Field(DEFAULT_FIELD, content, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES)); d.add(new Field("id", id.toString(), Field.Store.YES, Field.Index.NOT_ANALYZED)); d.add(new Field("path", "/content" + path.toString().toLowerCase(_locale), Field.Store.NO, Field.Index.NOT_ANALYZED)); d.add(new Field("name", name.toString().toLowerCase(_locale), Field.Store.NO, Field.Index.NOT_ANALYZED)); addEnum(d, "type", type); addStringField(d, "title", title); addTagsField(d, "tags", tags); addDateField(d, "date_created", dateCreated); addDateField(d, "date_changed", dateChanged); _writer.addDocument(d); LOG.debug("Added document."); } catch (final IOException e) { LOG.warn("Error adding document.", e); } }
From source file:cn.edu.thss.iise.beehivez.server.index.labelindex.LabelDocument.java
License:Open Source License
public static Document Document(String label) { Document doc = new Document(); Field fLabel = new Field(FIELD_LABEL, label, Field.Store.YES, Field.Index.ANALYZED_NO_NORMS, Field.TermVector.YES); fLabel.setOmitTermFreqAndPositions(true); doc.add(fLabel);//from w w w . j av a 2 s . co m return doc; }
From source file:com.appspot.socialinquirer.server.service.impl.AnalysisServiceImpl.java
License:Apache License
/**
 * Extracts weighted terms (tags) from a title and an HTML body by indexing
 * both into a throwaway RAM index with term vectors enabled, then reading
 * the term-frequency vectors back. The resulting tags are sorted by
 * descending frequency.
 *
 * @param title document title text
 * @param text  document body (HTML is stripped before indexing)
 * @return tags sorted by descending frequency; empty on error
 */
@Override
public List<Tag> getTermVector(String title, String text) {
    RAMDirectory directory = null;
    IndexReader reader = null;
    IndexWriter writer = null;
    Map<String, Tag> tagsMap = new HashMap<String, Tag>();
    try {
        directory = new RAMDirectory();
        writer = new IndexWriter(directory, new StandardAnalyzer(Version.LUCENE_CURRENT), true,
                MaxFieldLength.UNLIMITED);
        Document doc = new Document();
        doc.add(new Field("title", title, Field.Store.YES, Field.Index.ANALYZED,
                Field.TermVector.YES));
        doc.add(new Field("body", stripHtmlTags(text, true), Field.Store.YES, Field.Index.ANALYZED,
                Field.TermVector.YES));
        writer.addDocument(doc);
        writer.close();
        writer = null; // closed cleanly; nothing left to clean up in finally
        reader = IndexReader.open(directory, true);
        int numDocs = reader.maxDoc();
        for (int i = 0; i < numDocs; i++) {
            TermFreqVector termFreqVector = reader.getTermFreqVector(i, "title");
            pullTags(termFreqVector, tagsMap);
            termFreqVector = reader.getTermFreqVector(i, "body");
            pullTags(termFreqVector, tagsMap);
        }
    } catch (Exception e) {
        logger.log(Level.SEVERE, "An error occured while pulling tags from text.", e);
    } finally {
        // FIX: the writer was previously leaked if an exception occurred before
        // close(); release it here on the failure path.
        if (writer != null) {
            try {
                writer.close();
            } catch (Exception ignored) {
                // best-effort cleanup of a throwaway RAM index
            }
        }
        closeIndexReader(reader);
        closeRAMDirectory(directory);
    }
    ArrayList<Tag> tagsList = new ArrayList<Tag>(tagsMap.values());
    // Sort by descending frequency.
    // FIX: explicit comparison instead of "o2 - o1" subtraction, which can
    // overflow for large counts and violate the Comparator contract.
    Collections.sort(tagsList, new Comparator<Tag>() {
        @Override
        public int compare(Tag o1, Tag o2) {
            int f1 = o1.getFreqency();
            int f2 = o2.getFreqency();
            return (f2 > f1) ? 1 : ((f2 < f1) ? -1 : 0);
        }
    });
    return tagsList;
}
From source file:com.bluecubs.xinco.index.XincoDocument.java
License:Apache License
public static Document getXincoDocument(XincoCoreData d, boolean index_content, XincoDBManager dbm) throws java.io.FileNotFoundException { int i, j, l;/* w ww. j av a 2s .co m*/ int i2, j2; short k, k2; FileInputStream is = null; Document doc = null; Document temp_doc = null; int file_type = 0; int file_ext_index = 0; String file_ext = ""; doc = new Document(); //add XincoCoreData information doc.add(new Field("id", (new Integer(d.getId())).toString(), true, true, false)); doc.add(Field.Text("designation", d.getDesignation())); doc.add(new Field("language", (new Integer(d.getXinco_core_language().getId())).toString(), true, true, false)); //add content of file if (index_content) { if ((d.getXinco_core_data_type().getId() == 1) && (d.getStatus_number() != 3)) { //process non-archived file //extract file extension from file name file_ext_index = ((XincoAddAttribute) d.getXinco_add_attributes().elementAt(0)).getAttrib_varchar() .lastIndexOf("."); if (file_ext_index == -1) { file_ext = ""; } else { if (file_ext_index >= ((XincoAddAttribute) d.getXinco_add_attributes().elementAt(0)) .getAttrib_varchar().length() - 1) { file_ext = ""; } else { file_ext = ((XincoAddAttribute) d.getXinco_add_attributes().elementAt(0)) .getAttrib_varchar().substring(file_ext_index + 1); } } //check which indexer to use for file extension file_type = 0; // default: index as TEXT for (l = 0; l < dbm.config.FileIndexerCount; l++) { for (i = 0; i < ((String[]) dbm.config.IndexFileTypesExt.elementAt(l)).length; i++) { if (((String[]) dbm.config.IndexFileTypesExt.elementAt(l))[i].compareTo(file_ext) == 0) { file_type = l + 1; // file-type specific indexing break; } } if (file_type > 0) { break; } } if (file_type == 0) { for (i = 0; i < dbm.config.IndexNoIndex.length; i++) { if (dbm.config.IndexNoIndex[i].compareTo(file_ext) == 0) { file_type = -1; // NO indexing break; } } } // call actual indexing classes XincoIndexFileType xift = null; Reader ContentReader = null; String ContentString = null; 
if (file_type == 0) { // index as TEXT xift = new XincoIndexText(); doc.add(Field.Text("file", xift.getFileContentReader( new File(dbm.config.FileRepositoryPath + (new Integer(d.getId())).toString())))); } else if (file_type > 0) { // file-type specific indexing try { xift = (XincoIndexFileType) Class .forName((String) dbm.config.IndexFileTypesClass.elementAt(file_type - 1)) .newInstance(); ContentReader = xift.getFileContentReader( new File(dbm.config.FileRepositoryPath + (new Integer(d.getId())).toString())); if (ContentReader != null) { doc.add(Field.Text("file", ContentReader)); } else { ContentString = xift.getFileContentString( new File(dbm.config.FileRepositoryPath + (new Integer(d.getId())).toString())); if (ContentString != null) { doc.add(Field.Text("file", ContentString)); } } } catch (Exception ie) { } } } } //add attributes for (i = 0; i < d.getXinco_add_attributes().size(); i++) { if (((XincoCoreDataTypeAttribute) d.getXinco_core_data_type().getXinco_core_data_type_attributes() .elementAt(i)).getData_type().toLowerCase().compareTo("int") == 0) { doc.add(Field.Text( ((XincoCoreDataTypeAttribute) d.getXinco_core_data_type() .getXinco_core_data_type_attributes().elementAt(i)).getDesignation(), "" + ((XincoAddAttribute) d.getXinco_add_attributes().elementAt(i)).getAttrib_int())); } if (((XincoCoreDataTypeAttribute) d.getXinco_core_data_type().getXinco_core_data_type_attributes() .elementAt(i)).getData_type().toLowerCase().compareTo("unsignedint") == 0) { doc.add(Field.Text( ((XincoCoreDataTypeAttribute) d.getXinco_core_data_type() .getXinco_core_data_type_attributes().elementAt(i)).getDesignation(), "" + ((XincoAddAttribute) d.getXinco_add_attributes().elementAt(i)) .getAttrib_unsignedint())); } if (((XincoCoreDataTypeAttribute) d.getXinco_core_data_type().getXinco_core_data_type_attributes() .elementAt(i)).getData_type().toLowerCase().compareTo("double") == 0) { doc.add(Field.Text( ((XincoCoreDataTypeAttribute) d.getXinco_core_data_type() 
.getXinco_core_data_type_attributes().elementAt(i)).getDesignation(), "" + ((XincoAddAttribute) d.getXinco_add_attributes().elementAt(i)).getAttrib_double())); } if (((XincoCoreDataTypeAttribute) d.getXinco_core_data_type().getXinco_core_data_type_attributes() .elementAt(i)).getData_type().toLowerCase().compareTo("varchar") == 0) { doc.add(Field.Text( ((XincoCoreDataTypeAttribute) d.getXinco_core_data_type() .getXinco_core_data_type_attributes().elementAt(i)).getDesignation(), ((XincoAddAttribute) d.getXinco_add_attributes().elementAt(i)).getAttrib_varchar())); } if (((XincoCoreDataTypeAttribute) d.getXinco_core_data_type().getXinco_core_data_type_attributes() .elementAt(i)).getData_type().toLowerCase().compareTo("text") == 0) { doc.add(Field.Text( ((XincoCoreDataTypeAttribute) d.getXinco_core_data_type() .getXinco_core_data_type_attributes().elementAt(i)).getDesignation(), ((XincoAddAttribute) d.getXinco_add_attributes().elementAt(i)).getAttrib_text())); } if (((XincoCoreDataTypeAttribute) d.getXinco_core_data_type().getXinco_core_data_type_attributes() .elementAt(i)).getData_type().toLowerCase().compareTo("datetime") == 0) { doc.add(Field.Text( ((XincoCoreDataTypeAttribute) d.getXinco_core_data_type() .getXinco_core_data_type_attributes().elementAt(i)).getDesignation(), "" + ((XincoAddAttribute) d.getXinco_add_attributes().elementAt(i)).getAttrib_datetime())); } } return doc; }