List of usage examples for org.apache.lucene.index IndexReader document
public final Document document(int docID) throws IOException
Returns the stored fields of the nth Document in this index.
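For orientation before the project examples below, here is a minimal sketch of the call. It is not taken from any of the listed projects; it assumes the Lucene 3.x API that most of these examples use, and the index path and the "title" field name are placeholders.

    // Minimal usage sketch (Lucene 3.x style); path and field name are placeholders.
    // Requires org.apache.lucene.document.Document, org.apache.lucene.index.IndexReader,
    // org.apache.lucene.store.FSDirectory and java.io.File.
    IndexReader reader = IndexReader.open(FSDirectory.open(new File("/path/to/index")));
    try {
        for (int i = 0; i < reader.maxDoc(); i++) {
            if (reader.isDeleted(i)) {
                continue; // skip slots freed by deletions
            }
            Document doc = reader.document(i); // load the stored fields of document i
            System.out.println(doc.get("title"));
        }
    } finally {
        reader.close();
    }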
From source file:net.sf.jclal.util.dataset.LuceneIndexToWekaDataSet.java
License:Open Source License
/**
 * Converts a Lucene index file to a Weka file for regression. The Weka class
 * attribute is real-valued, so the classifiers used must handle numeric (real) classes.
 *
 * @param wekaFileName Path of the Weka file.
 * @param indexFile Path of the Lucene-based index file. The indexed documents
 * must have fields called "class" and "content". WARNING: The fields must
 * not contain any punctuation signs.
 *
 * @return Weka instances. The instances are sparse, since they represent text data.
 *
 * @throws FileNotFoundException If the file does not exist.
 * @throws IOException If an error occurs while writing the file.
 */
public Instances convertLuceneToWekaRegression(String wekaFileName, String indexFile)
        throws FileNotFoundException, IOException {
    File nuevo = new File(wekaFileName);
    if (!verify(nuevo)) {
        return null;
    }
    FileUtil.writeFile(nuevo, "@RELATION " + nuevo.getName() + doubleLine);

    IndexSearcher searcher = new IndexSearcher(indexFile);
    IndexReader reader = searcher.getIndexReader();

    int total = reader.maxDoc();
    HashMap<String, Integer> terms = new HashMap<String, Integer>(total * 2);
    HashMap<String, Integer> labels = new HashMap<String, Integer>(total * 2);
    int i;
    for (int l = 0; l < total; l++) {
        if (!reader.isDeleted(l)) {
            TermFreqVector vector = reader.getTermFreqVector(l, content);
            Document doc = reader.document(l);
            String current = doc.getField(classF).stringValue();
            if (!labels.containsKey(current)) {
                labels.put(current, labels.size());
            }
            if (vector != null) {
                String listosI[] = vector.getTerms();
                for (i = 0; i < listosI.length; i++) {
                    if (!terms.containsKey(listosI[i])) {
                        terms.put(listosI[i], terms.size());
                    }
                }
            }
        }
    }
    Container[] terminos = convertir(terms);
    Arrays.sort(terminos);
    for (int j = 0; j < terminos.length; j++) {
        FileUtil.writeFile(nuevo, "@ATTRIBUTE " + (int) terminos[j].getKey() + " NUMERIC" + "\n");
    }
    FileUtil.writeFile(nuevo, "@ATTRIBUTE class REAL [0.0,");
    FileUtil.writeFile(nuevo, (labels.size() - 1) + ".0]" + doubleLine);
    FileUtil.writeFile(nuevo, "@DATA\n");
    for (int pos = 0; pos < searcher.maxDoc(); pos++) {
        if (!reader.isDeleted(pos)) {
            TermFreqVector vector = reader.getTermFreqVector(pos, content);
            if (vector != null) {
                int[] origen = vector.getTermFrequencies();
                String[] termsI = vector.getTerms();
                int[] positions = new int[origen.length];
                for (int k = 0; k < origen.length; k++) {
                    positions[k] = terms.get(termsI[k]);
                }
                Container[] escribir = convertir(positions, origen);
                Arrays.sort(escribir);
                FileUtil.writeFile(nuevo, "{");
                for (int j = 0; j < escribir.length; j++) {
                    FileUtil.writeFile(nuevo, (int) escribir[j].getKey() + " " + escribir[j].getValue() + ",");
                }
                FileUtil.writeFile(nuevo, terms.size() + " "
                        + labels.get(searcher.doc(pos).getField(classF).stringValue()) + ".0}\n");
            }
        }
    }
    // close files
    closeReaders(searcher, reader);
    // Test if the weka file works
    Instances test = testWekaFile(wekaFileName);
    return test;
}
From source file:net.sf.logsaw.index.internal.LuceneIndexServiceImpl.java
License:Open Source License
private Date getLatestEntryDate(ILogResource log) throws CoreException {
    if (!hasDateComponent(log)) {
        return null;
    }
    ARunWithIndexReader<Date> runnable = new ARunWithIndexReader<Date>() {

        /* (non-Javadoc)
         * @see net.sf.logsaw.index.impl.ARunWithIndexReader#doRunWithIndexReader(org.apache.lucene.index.IndexReader, net.sf.logsaw.core.framework.ILogResource)
         */
        @Override
        protected Date doRunWithIndexReader(IndexReader reader, ILogResource log) throws CoreException {
            if (reader == null) {
                // Index does not exist yet
                return null;
            }
            int i = reader.maxDoc();
            if (i > 0) {
                try {
                    Document doc = reader.document(i - 1);
                    String val = doc.get(log.getDialect().getFieldProvider().getTimestampField().getKey());
                    return log.getDialect().getFieldProvider().getTimestampField().fromIndexedValue(val);
                } catch (IOException e) {
                    // Unexpected exception; wrap with CoreException
                    throw new CoreException(new Status(IStatus.ERROR, IndexPlugin.PLUGIN_ID,
                            NLS.bind(Messages.LuceneIndexService_error_failedToReadIndex,
                                    new Object[] { log.getName(), e.getLocalizedMessage() }), e));
                }
            }
            return null;
        }
    };
    return runnable.runWithIndexReader(log);
}
From source file:net.sf.zekr.engine.search.lucene.QuranTextSearcher.java
@Override
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
    BitSet bits = new BitSet(reader.maxDoc());
    for (int i = 0; i < reader.maxDoc(); i++) {
        Document doc = reader.document(i);
        IQuranLocation loc = new QuranLocation(doc.getField("location").stringValue());
        if (searchScope.includes(loc)) {
            bits.set(i);
        }
    }
    return new DocIdBitSet(bits);
}
From source file:nl.elucidator.maven.analyzer.indexer.IndexSearcher.java
License:Apache License
public Set<ArtifactInfo> getUniqueGAV() throws IOException, ComponentLookupException {
    IndexingContext centralContext = indexUpdater.getIndexContext();
    centralContext.lock();
    Set<ArtifactInfo> artifactInfoSet = new HashSet<ArtifactInfo>();
    try {
        final IndexReader ir = centralContext.getIndexReader();
        for (int i = 0; i < ir.maxDoc(); i++) {
            if (!ir.isDeleted(i)) {
                final Document doc = ir.document(i);
                final ArtifactInfo ai = IndexUtils.constructArtifactInfo(doc, centralContext);
                if (ai != null) {
                    artifactInfoSet.add(ai);
                }
            }
        }
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        centralContext.unlock();
    }
    return artifactInfoSet;
}
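A side note on API versions: this example, like several others on this page, guards reader.document(i) with IndexReader.isDeleted(i), which exists only up to Lucene 3.x. As a hedged sketch, assuming Lucene 4.x to 7.x, the same iterate-all-documents pattern would rely on live-docs bits instead (the "id" field name is a placeholder):

    // Sketch assuming Lucene 4.x-7.x; uses org.apache.lucene.index.MultiFields and org.apache.lucene.util.Bits.
    Bits liveDocs = MultiFields.getLiveDocs(reader); // null when the index contains no deletions
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (liveDocs != null && !liveDocs.get(i)) {
            continue; // skip deleted documents
        }
        Document doc = reader.document(i); // stored fields of the i-th document
        System.out.println(doc.get("id"));
    }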
From source file:nl.inl.blacklab.perdocument.DocResults.java
License:Apache License
/**
 * Construct DocResults from a Scorer (Lucene document results).
 *
 * @param searcher the searcher that generated the results
 * @param scorer the scorer to read document results from
 */
DocResults(Searcher searcher, Scorer scorer) {
    this.searcher = searcher;
    if (scorer == null)
        return; // no matches, empty result set
    try {
        IndexReader indexReader = searcher.getIndexReader();
        while (true) {
            int docId;
            try {
                docId = scorer.nextDoc();
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
            if (docId == DocIdSetIterator.NO_MORE_DOCS)
                break;
            Document d = indexReader.document(docId);
            DocResult dr = new DocResult(searcher, null, docId, d, scorer.score());
            results.add(dr);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
From source file:nl.inl.blacklab.perdocument.DocResults.java
License:Apache License
private void addDocResultToList(int doc, Hits docHits, IndexReader indexReader) throws IOException {
    DocResult docResult = new DocResult(searcher, sourceHits.getConcordanceFieldName(), doc,
            indexReader == null ? null : indexReader.document(doc), docHits);
    // Make sure we remember what kind of context we have, if any
    docResult.setContextField(sourceHits.getContextFieldPropName());
    results.add(docResult);
}
From source file:nl.uva.mlc.eurovoc.analyzer.PropagationAnalyzer.java
private void testIndexDocReader() {
    try {
        IndexReader testIreader = IndexReader
                .open(new SimpleFSDirectory(new File(configFile.getProperty("TEST_INDEX_PATH"))));
        for (int i = 0; i < testIreader.numDocs(); i++) {
            String id = testIreader.document(i).get("ID");
            String title = testIreader.document(i).get("TITLE");
            String text = testIreader.document(i).get("TEXT");
            String namedEntities = testIreader.document(i).get("NAMEDENTITIES");
            String[] classes = testIreader.document(i).get("CLASSES").split("\\s+");
            EuroVocDoc doc = new EuroVocDoc(id, title, text, namedEntities,
                    new ArrayList<String>(Arrays.asList(classes)));
            Quering(doc);
            log.info(i + " from " + testIreader.numDocs());
        }
    } catch (IOException ex) {
        log.error(ex);
    }
}
From source file:nmsu.cs.TFIDFVector.java
License:Open Source License
/**
 * calculate likelihood from the index
 * @param indexDir
 * @param lambda
 */
public void calLikelihoodFromIndex(String indexDir, double lambda) {
    try {
        IndexReader ir = IndexReader.open(FSDirectory.open(new File(indexDir)));
        IndexSearcher is = new IndexSearcher(ir);
        int numDocs = ir.maxDoc();
        double LLH = 0;

        // vocabulary list
        List<String> vocab = new ArrayList<String>();
        TermEnum te = ir.terms();
        // create vocabulary
        while (te.next()) {
            String term = te.term().text();
            // System.out.println(term);
            vocab.add(term);
        }
        TFIDFVector.vocabulary = vocab;

        // dataset id to index id
        Map<Integer, Integer> idMap = new HashMap<Integer, Integer>();
        for (int i = 0; i < numDocs; i++) {
            Document doc = ir.document(i);
            idMap.put(Integer.parseInt(doc.get("docid")), i);
        }

        // o -> a -> o'
        Map<Integer, Map<Integer, Map<Integer, Double>>> cosineSimMap = new HashMap<Integer, Map<Integer, Map<Integer, Double>>>();
        // (o | o') dataset id -> tfidf vector
        Map<Integer, TFIDFVector> docVectorMap = new HashMap<Integer, TFIDFVector>();
        // o -> a -> vector
        Map<Integer, Map<Integer, TFIDFVector>> docAspectVectorMap = new HashMap<Integer, Map<Integer, TFIDFVector>>();

        Set<Integer> citedSet = new HashSet<Integer>();

        // for all citing documents
        for (Map.Entry<Integer, List<Integer>> entry : rawdata.pubId2CiteIds.entrySet()) { // llh for citing documents
            int citingDatasetID = entry.getKey();
            int citingIndexID = idMap.get(citingDatasetID);

            // set up citing document vector
            TFIDFVector citingVector = BaseLineMethod.getFullTextTFIDFVector(docVectorMap, ir, citingDatasetID,
                    citingIndexID, numDocs);
            float sum = citingVector.sum();
            // System.out.println(Debugger.getCallerPosition()+" "+citingDatasetID);
            List<Integer> refList = entry.getValue();

            // for all aspects
            for (Integer aspectID : rawdata.id2Aspect.keySet()) {
                String aspect = rawdata.id2Aspect.get(aspectID);

                // set up citing document aspect vector
                double aspectSim = 0;
                if (rawdata.id2Docs.get(citingDatasetID).getText().get(aspectID).length() != 0) {
                    TFIDFVector citingAspectVector = BaseLineMethod.getAspectTFIDFVector(docAspectVectorMap, ir,
                            citingDatasetID, citingIndexID, aspectID, numDocs);
                    citingAspectVector.normalizedBy(sum);

                    int refSize = refList.size();
                    TFIDFVector[] citedVectors = new TFIDFVector[refSize];
                    double[] cosineSims = new double[refSize];
                    int count = 0;

                    // for all cited documents of this citing document
                    for (Integer citedDatasetID : refList) {
                        citedSet.add(citedDatasetID);

                        // set up cited document vector
                        int citedIndexID = idMap.get(citedDatasetID);
                        TFIDFVector citedVector = BaseLineMethod.getFullTextTFIDFVector(docVectorMap, ir,
                                citedDatasetID, citedIndexID, numDocs);
                        citedVector.normalize();

                        aspectSim = TFIDFVector.computeCosineSim(citedVector, citingAspectVector);
                        // System.out.println(Debugger.getCallerPosition()+"\t\t"+aspectSim);
                        System.out.println(
                                citingDatasetID + "\t" + aspectID + "\t" + citedDatasetID + "\t" + aspectSim);

                        citedVectors[count] = citedVector;
                        cosineSims[count] = aspectSim;
                        count++;
                    }
                    double aspectLLH = citingAspectVector.posteriorLLH(citedVectors, cosineSims, lambda);
                    LLH += aspectLLH;
                }
                // Util.update3Map(cosineSimMap, citingDatasetID, aspectID, citedDatasetID, aspectSim);
            }
        }

        for (Integer citedDatasetID : citedSet) {
            int citedIndexID = idMap.get(citedDatasetID);
            TFIDFVector citedVector = BaseLineMethod.getFullTextTFIDFVector(docVectorMap, ir, citedDatasetID,
                    citedIndexID, numDocs);
            citedVector.normalize();
            LLH += citedVector.priorLLH();
        }

        System.out.println(LLH);
        is.close();
        ir.close();
    } catch (CorruptIndexException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}
From source file:opennlp.tools.similarity.apps.solr.IterativeSearchRequestHandler.java
License:Apache License
public DocList filterResultsBySyntMatchReduceDocSet(DocList docList, SolrQueryRequest req, SolrParams params) {
    //if (!docList.hasScores())
    //    return docList;

    int len = docList.size();
    if (len < 1) // do nothing
        return docList;
    ParserChunker2MatcherProcessor pos = ParserChunker2MatcherProcessor.getInstance();
    DocIterator iter = docList.iterator();
    float[] syntMatchScoreArr = new float[len];
    String requestExpression = req.getParamString();
    String[] exprParts = requestExpression.split("&");
    for (String part : exprParts) {
        if (part.startsWith("q="))
            requestExpression = part;
    }
    String fieldNameQuery = StringUtils.substringBetween(requestExpression, "=", ":");
    // extract phrase query (in double-quotes)
    String[] queryParts = requestExpression.split("\"");
    if (queryParts.length >= 2 && queryParts[1].length() > 5)
        requestExpression = queryParts[1].replace('+', ' ');
    else if (requestExpression.indexOf(":") > -1) { // still field-based expression
        requestExpression = requestExpression.replaceAll(fieldNameQuery + ":", "").replace('+', ' ')
                .replaceAll(" ", " ").replace("q=", "");
    }

    if (fieldNameQuery == null)
        return docList;
    if (requestExpression == null || requestExpression.length() < 5 || requestExpression.split(" ").length < 3)
        return docList;

    int[] docIDsHits = new int[len];

    IndexReader indexReader = req.getSearcher().getIndexReader();
    List<Integer> bestMatchesDocIds = new ArrayList<Integer>();
    List<Float> bestMatchesScore = new ArrayList<Float>();
    List<Pair<Integer, Float>> docIdsScores = new ArrayList<Pair<Integer, Float>>();
    try {
        for (int i = 0; i < docList.size(); ++i) {
            int docId = iter.nextDoc();
            docIDsHits[i] = docId;
            Document doc = indexReader.document(docId);

            // get text for event
            String answerText = doc.get(fieldNameQuery);
            if (answerText == null)
                continue;
            SentencePairMatchResult matchResult = pos.assessRelevance(requestExpression, answerText);
            float syntMatchScore = new Double(
                    parseTreeChunkListScorer.getParseTreeChunkListScore(matchResult.getMatchResult()))
                            .floatValue();
            bestMatchesDocIds.add(docId);
            bestMatchesScore.add(syntMatchScore);
            syntMatchScoreArr[i] = (float) syntMatchScore; //*iter.score();
            System.out.println(" Matched query = '" + requestExpression + "' with answer = '" + answerText
                    + "' | doc_id = '" + docId);
            System.out.println(" Match result = '" + matchResult.getMatchResult() + "' with score = '"
                    + syntMatchScore + "';");
            docIdsScores.add(new Pair(docId, syntMatchScore));
        }
    } catch (CorruptIndexException e1) {
        // TODO Auto-generated catch block
        e1.printStackTrace();
        //log.severe("Corrupt index"+e1);
    } catch (IOException e1) {
        // TODO Auto-generated catch block
        e1.printStackTrace();
        //log.severe("File read IO / index"+e1);
    }

    Collections.sort(docIdsScores, new PairComparable());
    for (int i = 0; i < docIdsScores.size(); i++) {
        bestMatchesDocIds.set(i, docIdsScores.get(i).getFirst());
        bestMatchesScore.set(i, docIdsScores.get(i).getSecond());
    }
    System.out.println(bestMatchesScore);
    float maxScore = docList.maxScore(); // do not change
    int limit = docIdsScores.size();
    int start = 0;
    DocSlice ds = null;

    ds = new DocSlice(start, limit, ArrayUtils.toPrimitive(bestMatchesDocIds.toArray(new Integer[0])),
            ArrayUtils.toPrimitive(bestMatchesScore.toArray(new Float[0])), bestMatchesDocIds.size(), maxScore);

    return ds;
}
From source file:opennlp.tools.similarity.apps.solr.IterativeSearchRequestHandler.java
License:Apache License
private void append(SolrDocumentList results, ScoreDoc[] more, Set<Integer> alreadyFound,
        Map<String, SchemaField> fields, Map<String, Object> extraFields, float scoreCutoff,
        IndexReader reader, boolean includeScore) throws IOException {
    for (ScoreDoc hit : more) {
        if (alreadyFound.contains(hit.doc)) {
            continue;
        }
        Document doc = reader.document(hit.doc);
        SolrDocument sdoc = new SolrDocument();
        for (String fieldname : fields.keySet()) {
            SchemaField sf = fields.get(fieldname);
            if (sf.stored()) {
                sdoc.addField(fieldname, doc.get(fieldname));
            }
        }
        for (String extraField : extraFields.keySet()) {
            sdoc.addField(extraField, extraFields.get(extraField));
        }
        if (includeScore) {
            sdoc.addField("score", hit.score);
        }
        results.add(sdoc);
        alreadyFound.add(hit.doc);
    }
}