List of usage examples for org.apache.lucene.document Document getField
public final IndexableField getField(String name)
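Before the project examples below, here is a minimal, self-contained sketch of the call (the field names "id", "title" and "missing" are illustrative assumptions, not taken from any of the projects): getField returns the first IndexableField stored under the given name, or null when the document has no field with that name.

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexableField;

public class GetFieldSketch {
    public static void main(String[] args) {
        // Build a document with two stored fields (field names are assumptions for this sketch).
        Document doc = new Document();
        doc.add(new StringField("id", "42", Field.Store.YES));
        doc.add(new TextField("title", "Lucene in Action", Field.Store.YES));

        // getField returns the first IndexableField with the given name...
        IndexableField title = doc.getField("title");
        System.out.println(title.stringValue()); // prints "Lucene in Action"

        // ...and null when no field with that name exists.
        System.out.println(doc.getField("missing")); // prints "null"
    }
}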
From source file:net.semanticmetadata.lire.searchers.GenericFastImageSearcher.java
License:Open Source License
public ImageSearchHits search(Document doc, IndexReader reader) throws IOException {
    SimpleImageSearchHits searchHits = null;
    // try {
    LireFeature lireFeature = extractorItem.getFeatureInstance();
    if (doc.getField(fieldName).binaryValue() != null && doc.getField(fieldName).binaryValue().length > 0)
        lireFeature.setByteArrayRepresentation(doc.getField(fieldName).binaryValue().bytes,
                doc.getField(fieldName).binaryValue().offset, doc.getField(fieldName).binaryValue().length);
    double maxDistance = findSimilar(reader, lireFeature);
    if (!useSimilarityScore) {
        searchHits = new SimpleImageSearchHits(this.docs, maxDistance);
    } else {
        searchHits = new SimpleImageSearchHits(this.docs, maxDistance, useSimilarityScore);
    }
    // } catch (InstantiationException e) {
    //     logger.log(Level.SEVERE, "Error instantiating class for generic image searcher: " + e.getMessage());
    // } catch (IllegalAccessException e) {
    //     logger.log(Level.SEVERE, "Error instantiating class for generic image searcher: " + e.getMessage());
    // }
    return searchHits;
}
From source file:net.sf.jclal.util.dataset.LuceneIndexToWekaDataSet.java
License:Open Source License
/**
 * Converts a Lucene index file to a Weka file for classification. The
 * Weka file class is nominal, so the classifiers will work with a nominal
 * class.
 *
 * @param wekaFileName Path of the Weka file.
 * @param indexFile Path of the Lucene-based index file. The indexed documents
 * must have fields called "class" and "content". WARNING: The fields must
 * not contain any punctuation sign.
 *
 * @return Weka instances. The instances are sparse since they hold
 * text information.
 *
 * @throws FileNotFoundException If the file does not exist.
 * @throws IOException If an error happens while writing the file.
 */
public Instances convertLuceneToWekaClassification(String wekaFileName, String indexFile)
        throws FileNotFoundException, IOException {
    File nuevo = new File(wekaFileName);
    if (!verify(nuevo)) {
        return null;
    }
    FileUtil.writeFile(nuevo, "@RELATION " + nuevo.getName() + doubleLine);

    IndexSearcher searcher = new IndexSearcher(indexFile);
    IndexReader reader = searcher.getIndexReader();

    int total = reader.maxDoc();
    HashMap<String, Integer> terms = new HashMap<String, Integer>(total * 2);
    Set<String> labels = new HashSet<String>(total * 2);
    int i;
    for (int l = 0; l < total; l++) {
        if (!reader.isDeleted(l)) {
            TermFreqVector vector = reader.getTermFreqVector(l, content);
            Document doc = reader.document(l);
            String current = doc.getField(classF).stringValue();
            labels.add(current);
            if (vector != null) {
                String listosI[] = vector.getTerms();
                for (i = 0; i < listosI.length; i++) {
                    if (!terms.containsKey(listosI[i])) {
                        terms.put(listosI[i], terms.size());
                    }
                }
            }
        }
    }

    String[] labelReady = new String[labels.size()];
    int posLabel = 0;
    for (String string : labels) {
        labelReady[posLabel] = string;
        posLabel++;
    }

    Container[] terminos = convertir(terms);
    Arrays.sort(terminos);

    for (int j = 0; j < terminos.length; j++) {
        FileUtil.writeFile(nuevo, "@ATTRIBUTE " + (int) terminos[j].getKey() + " NUMERIC" + "\n");
    }
    FileUtil.writeFile(nuevo, "@ATTRIBUTE class {");
    for (int j = 0; j < labelReady.length - 1; j++) {
        FileUtil.writeFile(nuevo, labelReady[j] + ",");
    }
    FileUtil.writeFile(nuevo, labelReady[labelReady.length - 1] + "}" + doubleLine);

    FileUtil.writeFile(nuevo, "@DATA\n");

    for (int pos = 0; pos < searcher.maxDoc(); pos++) {
        if (!reader.isDeleted(pos)) {
            TermFreqVector vector = reader.getTermFreqVector(pos, content);
            if (vector != null) {
                int[] origen = vector.getTermFrequencies();
                String[] termsI = vector.getTerms();
                int[] positions = new int[origen.length];
                for (int k = 0; k < origen.length; k++) {
                    positions[k] = terms.get(termsI[k]);
                }
                Container[] escribir = convertir(positions, origen);
                Arrays.sort(escribir);
                FileUtil.writeFile(nuevo, "{");
                for (int j = 0; j < escribir.length; j++) {
                    FileUtil.writeFile(nuevo, (int) escribir[j].getKey() + " " + escribir[j].getValue() + ",");
                }
                FileUtil.writeFile(nuevo,
                        terms.size() + " " + searcher.doc(pos).getField(classF).stringValue() + "}\n");
            }
        }
    }

    // close files
    closeReaders(searcher, reader);

    // Test whether the Weka file works
    Instances test = testWekaFile(wekaFileName);

    return test;
}
From source file:net.sf.jclal.util.dataset.LuceneIndexToWekaDataSet.java
License:Open Source License
/**
 * Converts a Lucene index file to a Weka file for regression. The
 * Weka file class is real-valued, so the classifiers used will work with a
 * numeric real class.
 *
 * @param wekaFileName Path of the Weka file.
 * @param indexFile Path of the Lucene-based index file. The indexed documents
 * must have fields called "class" and "content". WARNING: The fields must
 * not contain any punctuation sign.
 *
 * @return Weka instances. The instances are sparse since they hold
 * text information.
 *
 * @throws FileNotFoundException If the file does not exist.
 * @throws IOException If an error happens while writing the file.
 */
public Instances convertLuceneToWekaRegression(String wekaFileName, String indexFile)
        throws FileNotFoundException, IOException {
    File nuevo = new File(wekaFileName);
    if (!verify(nuevo)) {
        return null;
    }
    FileUtil.writeFile(nuevo, "@RELATION " + nuevo.getName() + doubleLine);

    IndexSearcher searcher = new IndexSearcher(indexFile);
    IndexReader reader = searcher.getIndexReader();

    int total = reader.maxDoc();
    HashMap<String, Integer> terms = new HashMap<String, Integer>(total * 2);
    HashMap<String, Integer> labels = new HashMap<String, Integer>(total * 2);
    int i;
    for (int l = 0; l < total; l++) {
        if (!reader.isDeleted(l)) {
            TermFreqVector vector = reader.getTermFreqVector(l, content);
            Document doc = reader.document(l);
            String current = doc.getField(classF).stringValue();
            if (!labels.containsKey(current)) {
                labels.put(current, labels.size());
            }
            if (vector != null) {
                String listosI[] = vector.getTerms();
                for (i = 0; i < listosI.length; i++) {
                    if (!terms.containsKey(listosI[i])) {
                        terms.put(listosI[i], terms.size());
                    }
                }
            }
        }
    }

    Container[] terminos = convertir(terms);
    Arrays.sort(terminos);

    for (int j = 0; j < terminos.length; j++) {
        FileUtil.writeFile(nuevo, "@ATTRIBUTE " + (int) terminos[j].getKey() + " NUMERIC" + "\n");
    }
    FileUtil.writeFile(nuevo, "@ATTRIBUTE class REAL [0.0,");
    FileUtil.writeFile(nuevo, (labels.size() - 1) + ".0]" + doubleLine);

    FileUtil.writeFile(nuevo, "@DATA\n");

    for (int pos = 0; pos < searcher.maxDoc(); pos++) {
        if (!reader.isDeleted(pos)) {
            TermFreqVector vector = reader.getTermFreqVector(pos, content);
            if (vector != null) {
                int[] origen = vector.getTermFrequencies();
                String[] termsI = vector.getTerms();
                int[] positions = new int[origen.length];
                for (int k = 0; k < origen.length; k++) {
                    positions[k] = terms.get(termsI[k]);
                }
                Container[] escribir = convertir(positions, origen);
                Arrays.sort(escribir);
                FileUtil.writeFile(nuevo, "{");
                for (int j = 0; j < escribir.length; j++) {
                    FileUtil.writeFile(nuevo, (int) escribir[j].getKey() + " " + escribir[j].getValue() + ",");
                }
                FileUtil.writeFile(nuevo, terms.size() + " "
                        + labels.get(searcher.doc(pos).getField(classF).stringValue()) + ".0}\n");
            }
        }
    }

    // close files
    closeReaders(searcher, reader);

    // Test whether the Weka file works
    Instances test = testWekaFile(wekaFileName);

    return test;
}
From source file:net.sf.zekr.engine.search.lucene.QuranTextSearcher.java
@Override
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
    BitSet bits = new BitSet(reader.maxDoc());
    for (int i = 0; i < reader.maxDoc(); i++) {
        Document doc = reader.document(i);
        IQuranLocation loc = new QuranLocation(doc.getField("location").stringValue());
        if (searchScope.includes(loc)) {
            bits.set(i);
        }
    }
    return new DocIdBitSet(bits);
}
From source file:net.skyatlas.icd.test.RAMDirectoryDemo.java
public static void main(String[] args) throws IOException {
    long startTime = System.currentTimeMillis();
    System.err.println("*************************** ****************************");
    RAMDirectory directory = new RAMDirectory();
    Version matchVersion = Version.LUCENE_48;
    Analyzer analyzer = new StandardAnalyzer(matchVersion);
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_48, analyzer);
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IndexWriter writer = new IndexWriter(directory, iwc);

    Document doc = new Document();
    doc.add(new Field("name", "Chenghui", Field.Store.YES, Field.Index.ANALYZED));
    doc.add(new Field("sex", "", Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(new Field("dosometing", "I am learning lucene ", Field.Store.YES, Field.Index.ANALYZED));
    writer.addDocument(doc);
    writer.close();

    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher searcher = new IndexSearcher(reader);
    Query query = new TermQuery(new Term("dosometing", "lucene"));
    TopDocs rs = searcher.search(query, null, 10);
    long endTime = System.currentTimeMillis();
    System.out.println("" + (endTime - startTime) + "" + rs.totalHits + "?");
    for (int i = 0; i < rs.scoreDocs.length; i++) {
        // rs.scoreDocs[i].doc is the internal document id (starting at 0)
        Document firstHit = searcher.doc(rs.scoreDocs[i].doc);
        System.out.println("name:" + firstHit.getField("name").stringValue());
        System.out.println("sex:" + firstHit.getField("sex").stringValue());
        System.out.println("dosomething:" + firstHit.getField("dosometing").stringValue());
    }
    reader.close();
    directory.close();
    System.out.println("*****************?**********************");
}
From source file:net.skyatlas.icd.util.MemoryGrid.java
public void searchDiagName(String diagName) throws IOException {
    IndexReader reader = DirectoryReader.open(this.directory);
    IndexSearcher searcher = new IndexSearcher(reader);
    Query query = new TermQuery(new Term("pathDesc", diagName));
    TopDocs ts = searcher.search(query, null, 100);
    long endTime = System.currentTimeMillis();
    for (int i = 0; i < ts.scoreDocs.length; i++) {
        // ts.scoreDocs[i].doc is the internal document id (starting at 0)
        Document firstHit = searcher.doc(ts.scoreDocs[i].doc);
        System.out.println("guid:" + firstHit.getField("guid").stringValue());
        // System.out.println("sex:" + firstHit.getField("sex").stringValue());
        System.out.println("pathDesc:" + firstHit.getField("pathDesc").stringValue());
    }
    reader.close();
    directory.close();
    System.out.println("*****************?**********************");
}
From source file:net.sourceforge.subsonic.service.LuceneSearchService.java
License:Open Source License
public SearchResult search(SearchCriteria criteria, IndexType indexType) {
    SearchResult result = new SearchResult();
    List<MusicFile> musicFiles = new ArrayList<MusicFile>();
    int offset = criteria.getOffset();
    int count = criteria.getCount();
    result.setOffset(offset);
    result.setMusicFiles(musicFiles);

    IndexReader reader = null;
    try {
        reader = createIndexReader(indexType);
        Searcher searcher = new IndexSearcher(reader);
        Analyzer analyzer = new SubsonicAnalyzer();

        MultiFieldQueryParser queryParser = new MultiFieldQueryParser(LUCENE_VERSION, indexType.getFields(),
                analyzer, indexType.getBoosts());
        Query query = queryParser.parse(criteria.getQuery());

        TopDocs topDocs = searcher.search(query, null, offset + count);
        result.setTotalHits(topDocs.totalHits);

        int start = Math.min(offset, topDocs.totalHits);
        int end = Math.min(start + count, topDocs.totalHits);
        for (int i = start; i < end; i++) {
            Document doc = searcher.doc(topDocs.scoreDocs[i].doc);
            musicFiles.add(musicFileService.getMusicFile(doc.getField(FIELD_PATH).stringValue()));
        }
    } catch (Throwable x) {
        LOG.error("Failed to execute Lucene search.", x);
    } finally {
        FileUtil.closeQuietly(reader);
    }
    return result;
}
From source file:net.strong.weblucene.index.SAXIndexer.java
License:Apache License
/**
 * Maps an original document to Lucene index fields.
 *
 * @param origDocument original Lucene Document
 *
 * @return Document: parses the original and builds the index fields
 */
private Document mapDoc(Document origDocument) {
    // new Lucene Document
    Document newDoc = new Document();
    try {
        //Enumeration fieldEnum = origDocument.fields();
        List fieldEnum = origDocument.getFields();
        //while (fieldEnum.hasMoreElements()) {
        while (fieldEnum != null && fieldEnum.size() > 0) {
            // Lucene Document Field
            //Field fld = (Field) fieldEnum.nextElement();
            Field fld = (Field) fieldEnum.remove(0);
            // index map field whose value is a field name list: 'field1,field2,field5....'
            if (fld.isIndexed()) {
                String indexName = fld.name();
                StringBuffer indexValue = new StringBuffer();
                // split the field list on ","
                String fieldList = fld.stringValue();
                StringTokenizer st = new StringTokenizer(fieldList, ",");
                while (st.hasMoreTokens()) {
                    // append the mapped field's value to indexValue
                    String mapFieldName = st.nextToken();
                    Field mapField = origDocument.getField(mapFieldName);
                    String mapValue = null;
                    if (mapField != null) {
                        mapValue = mapField.stringValue();
                    }
                    // add the text field value to the indexing field
                    if (mapValue != null) {
                        indexValue.append(mapValue);
                        // add a space between fields to avoid "field1field2"
                        indexValue.append(" ");
                    }
                }
                if (indexValue.length() > 0) {
                    Field newIndex = new Field(indexName, indexValue.toString(), Field.Store.NO,
                            Field.Index.TOKENIZED, Field.TermVector.YES);
                    newDoc.add(newIndex);
                }
            } else {
                // add a common field
                newDoc.add(fld);
            }
        }
    } catch (Exception e) {
        logger.error(e.toString());
        return null;
    }
    return newDoc;
}
From source file:NewsIR_search.Indexer.java
void indexFile(File collFile) throws Exception {
    Document doc;
    String docType = prop.getProperty("docType");
    if (docType.equalsIgnoreCase("trec")) {
        try {
            TrecDocIterator docElts = new TrecDocIterator(collFile);
            Document docElt;
            while (docElts.hasNext()) {
                docElt = docElts.next();
                if (docElt == null) {
                    System.out.println("docElt null");
                    break;
                }
                String tmp;
                int i = 0;
                tmp = prop.getProperty("index_fields", "null");
                if (!tmp.contentEquals("null")) {
                    for (i = 0; i < tmp.split(",").length; i++) {
                        // System.out.println(index_fields[i] + " ---> " + docElt.getField(index_fields[i]).stringValue());
                        INDEX_FIELDS[i] = docElt.getField(index_fields[i]).stringValue();
                    }
                }
                String DOCNOElt = docElt.getField("DOCNO").stringValue();
                FIELD_ID = DOCNOElt;
                String TEXTElt = docElt.getField("TEXT").stringValue();
                doc = constructDoc(TEXTElt);
                indexWriter.addDocument(doc);
                System.out.println(DOCNOElt);
                docIndexedCounter++;
            }
        } catch (FileNotFoundException ex) {
            System.err.println("Error: '" + collFile.getAbsolutePath() + "' not found");
            ex.printStackTrace();
        } catch (IOException ex) {
            System.err.println("Error: IOException on reading '" + collFile.getAbsolutePath() + "'");
            ex.printStackTrace();
        }
    }
}
From source file:nl.knaw.huygens.timbuctoo.lucene.demoTwo.SearchFiles.java
License:Apache License
/**
 * This demonstrates a typical paging search scenario, where the search
 * engine presents pages of size n to the user. The user can then go to the
 * next page if interested in the next hits.
 *
 * When the query is executed for the first time, then only enough results
 * are collected to fill 5 result pages. If the user wants to page beyond
 * this limit, then the query is executed another time and all hits are
 * collected.
 */
public static void doPagingSearch(BufferedReader in, IndexSearcher searcher, Query query, int hitsPerPage,
        boolean raw, boolean interactive) throws IOException {

    // Collect enough docs to show 5 pages
    TopDocs results = searcher.search(query, 5 * hitsPerPage);
    ScoreDoc[] hits = results.scoreDocs;

    int numTotalHits = results.totalHits;
    System.out.println(numTotalHits + " total matching documents");

    int start = 0;
    int end = Math.min(numTotalHits, hitsPerPage);

    while (true) {
        if (end > hits.length) {
            System.out.println("Only results 1 - " + hits.length + " of " + numTotalHits
                    + " total matching documents collected.");
            System.out.println("Collect more (y/n) ?");
            String line = in.readLine();
            if (line.length() == 0 || line.charAt(0) == 'n') {
                break;
            }
            hits = searcher.search(query, numTotalHits).scoreDocs;
        }

        end = Math.min(hits.length, start + hitsPerPage);

        for (int i = start; i < end; i++) {
            if (raw) { // output raw format
                System.out.println("doc=" + hits[i].doc + " score=" + hits[i].score);
                continue;
            }
            Document doc = searcher.doc(hits[i].doc);
            // System.out.println(doc.getField("content"));
            // IndexableField content = doc.getField("content");
            // System.out.println(content.stringValue());
            // String[] values = doc.getValues("indexed");
            // for (int j = 0; j < values.length; j++) {
            //     System.out.println(j + " : " + values[j]);
            // }
            String path = doc.get("path");
            if (path != null) {
                System.out.println((i + 1) + ". " + path);
                System.out.println(doc.get("author"));
                System.out.println(doc.get("title"));
                System.out.println(doc.get("date"));
                System.out.println(doc.get("content").substring(0, 60));
                // for (IndexableField field : doc.getFields()) {
                //     System.out.println("field: " + field);
                // }
                if (doc.getField("indexed") != null) {
                    System.out.println(" indexed: " + doc.getField("indexed"));
                }
            } else {
                System.out.println((i + 1) + ". " + "No path for this document");
            }
            System.out.flush();
        }

        if (!interactive || end == 0) {
            break;
        }

        if (numTotalHits >= end) {
            boolean quit = false;
            while (true) {
                System.out.print("Press ");
                if (start - hitsPerPage >= 0) {
                    System.out.print("(p)revious page, ");
                }
                if (start + hitsPerPage < numTotalHits) {
                    System.out.print("(n)ext page, ");
                }
                System.out.println("(q)uit or enter number to jump to a page.");

                String line = in.readLine();
                if (line.length() == 0 || line.charAt(0) == 'q') {
                    quit = true;
                    break;
                }
                if (line.charAt(0) == 'p') {
                    start = Math.max(0, start - hitsPerPage);
                    break;
                } else if (line.charAt(0) == 'n') {
                    if (start + hitsPerPage < numTotalHits) {
                        start += hitsPerPage;
                    }
                    break;
                } else {
                    int page = Integer.parseInt(line);
                    if ((page - 1) * hitsPerPage < numTotalHits) {
                        start = (page - 1) * hitsPerPage;
                        break;
                    } else {
                        System.out.println("No such page");
                    }
                }
            }
            if (quit)
                break;
            end = Math.min(numTotalHits, start + hitsPerPage);
        }
    }
}