Usage examples for org.apache.lucene.index.IndexReader.maxDoc()
public abstract int maxDoc();
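Before the project examples, a minimal sketch of the pattern most of them share, assuming the pre-4.0 Lucene API used throughout this page (IndexReader.open(String), isDeleted(int)); the index path and class name are hypothetical. maxDoc() returns one greater than the largest document number, so valid doc ids run from 0 to maxDoc() - 1, and some of those slots may belong to deleted documents:

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;

public class MaxDocScan {
    public static void main(String[] args) throws Exception {
        IndexReader reader = IndexReader.open("/path/to/index"); // hypothetical location
        try {
            int maxDoc = reader.maxDoc();
            for (int i = 0; i < maxDoc; i++) {
                if (reader.isDeleted(i)) {
                    continue; // slot belongs to a deleted document
                }
                Document doc = reader.document(i); // load stored fields
                // ... inspect doc ...
            }
        } finally {
            reader.close(); // always release the reader
        }
    }
}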
From source file:org.hippoecm.repository.query.lucene.util.CachingMultiReaderQueryFilter.java
License:Apache License
@Override
public DocIdSet getDocIdSet(final IndexReader reader) throws IOException {
    if (reader instanceof MultiIndexReader) {
        MultiIndexReader multiIndexReader = (MultiIndexReader) reader;
        IndexReader[] indexReaders = multiIndexReader.getIndexReaders();
        DocIdSet[] docIdSets = new DocIdSet[indexReaders.length];
        int[] maxDocs = new int[indexReaders.length];
        for (int i = 0; i < indexReaders.length; i++) {
            IndexReader subReader = indexReaders[i];
            docIdSets[i] = getIndexReaderDocIdSet(subReader, subReader);
            maxDocs[i] = subReader.maxDoc();
        }
        return new MultiDocIdSet(docIdSets, maxDocs);
    }
    log.warn("MultiIndexReader was expected but not found. Do not dissect the reader but use it as one instead");
    return getIndexReaderDocIdSet(reader, reader);
}
From source file:org.hippoecm.repository.query.lucene.util.CachingMultiReaderQueryFilter.java
License:Apache License
private OpenBitSet createDocIdSet(IndexReader reader) throws IOException {
    final OpenBitSet bits = new OpenBitSet(reader.maxDoc());
    long start = System.currentTimeMillis();
    new IndexSearcher(reader).search(query, new AbstractHitCollector() {
        @Override
        public final void collect(int doc, float score) {
            bits.set(doc); // set bit for hit
        }
    });
    log.info("Creating CachingMultiReaderQueryFilter doc id set took {} ms.",
            String.valueOf(System.currentTimeMillis() - start));
    return bits;
}
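Design note: the OpenBitSet above is allocated with reader.maxDoc() bits because doc ids handed to a Lucene collector always fall in the range [0, maxDoc()), so every possible hit has a corresponding bit; deleted documents are never returned as hits, so their bits simply stay clear.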
From source file:org.infoglue.cms.controllers.kernel.impl.simple.LuceneController.java
License:Open Source License
public Map getIndexInformation() {
    Map info = new HashMap();
    try {
        String index = CmsPropertyHandler.getContextRootPath() + File.separator + "lucene" + File.separator + "index";
        boolean indexExists = IndexReader.indexExists(new File(index));
        if (!indexExists) {
            try {
                File INDEX_DIR = new File(index);
                IndexWriter writer = new IndexWriter(INDEX_DIR, new StandardAnalyzer());
                logger.info("Indexing to directory '" + INDEX_DIR + "'...");
                writer.deleteDocuments(new Term("initializer", "true"));
                logger.info("Optimizing...");
                writer.optimize();
                writer.close();
            } catch (Exception e) {
                logger.error("Error creating index:" + e.getMessage(), e);
            }
        }
        IndexReader reader = IndexReader.open(index);
        int maxDoc = reader.maxDoc();
        int numDoc = reader.numDocs();
        long lastModified = IndexReader.lastModified(index);
        info.put("maxDoc", new Integer(maxDoc));
        info.put("numDoc", new Integer(numDoc));
        info.put("lastModified", new Date(lastModified));
        reader.close();
    } catch (Exception e) {
        logger.error("Error creating index:" + e.getMessage(), e);
    }
    return info;
}
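The example above reports both counters; they differ by exactly the number of documents flagged as deleted but not yet merged away. A minimal sketch of that relationship, under the same assumed pre-4.0 API and a hypothetical index path:

IndexReader reader = IndexReader.open("/path/to/index");
int maxDoc = reader.maxDoc();   // document slots, including deleted ones
int numDocs = reader.numDocs(); // live documents only
int deleted = maxDoc - numDocs; // pending deletions still occupying slots
reader.close();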
From source file:org.jab.docsearch.Index.java
License:Open Source License
/**
 * The location of a URL in an index; used in the algorithm for updating an
 * index.
 *
 * @return the location of the SpiderUrl in a web oriented DocSearcherIndex,
 *         or -1 if the URL is not in the index
 */
public int spiderIndexNum(int lastFound, String fileName, IndexReader ir) {
    int returnInt = -1;
    synchronized (this) {
        if (lastFound == -1)
            lastFound = 0;
        try {
            Document doc;
            String compareName = "";
            int numDocs = ir.maxDoc();
            for (int i = lastFound; i < numDocs; i++) {
                if (!ir.isDeleted(i)) {
                    doc = ir.document(i);
                    if (doc != null) {
                        compareName = doc.get(FIELD_URL);
                        if (compareName.equals(fileName)) {
                            returnInt = i;
                            break;
                        }
                    }
                }
            }
            if (returnInt == -1) {
                for (int i = lastFound; i > 0; i--) {
                    if (!ir.isDeleted(i)) {
                        doc = ir.document(i);
                        if (doc != null) {
                            compareName = doc.get(FIELD_URL);
                            // System.out.println("Comparing " + compareName + " to " + fileName);
                            if (compareName.equals(fileName)) {
                                // System.out.println("MATCH FOUND AT " + i);
                                returnInt = i;
                                break;
                            }
                        }
                    }
                }
            }
            if (returnInt == -1)
                ds.setStatus("File " + fileName + " not found in index!");
        } catch (Exception e) {
            logger.error("spiderIndexNum() failed", e);
            ds.setStatus("Error determining if doc is already in index!");
        }
        // finally {
        return returnInt;
        // }
    }
}
From source file:org.jab.docsearch.Index.java
License:Open Source License
/**
 * Location of a file in a DocSearcher index; used by the update algorithm to
 * update an index.
 *
 * @return location of the document in the DocSearcherIndex or -1 if it is
 *         not in there
 */
public int indexNum(int lastFound, String fileName, IndexReader ir) {
    int returnInt = -1;
    synchronized (this) {
        if (lastFound == -1)
            lastFound = 0;
        try {
            Document doc;
            String compareName = "";
            int numDocs = ir.maxDoc();
            for (int i = lastFound; i < numDocs; i++) {
                if (!ir.isDeleted(i)) {
                    doc = ir.document(i);
                    if (doc != null) {
                        compareName = doc.get(FIELD_PATH);
                        if (compareName.equals(fileName)) {
                            returnInt = i;
                            break;
                        }
                    }
                }
            }
            if (returnInt == -1) {
                for (int i = lastFound; i > 0; i--) {
                    if (!ir.isDeleted(i)) {
                        doc = ir.document(i);
                        if (doc != null) {
                            compareName = doc.get(FIELD_PATH);
                            // System.out.println("Comparing " + compareName + " to " + fileName);
                            if (compareName.equals(fileName)) {
                                // System.out.println("MATCH FOUND AT " + i);
                                returnInt = i;
                                break;
                            }
                        }
                    }
                }
            }
            if (returnInt == -1)
                ds.setStatus("File " + fileName + " not found in index!");
        } catch (Exception e) {
            logger.error("indexNum() failed", e);
            ds.setStatus("Error determining if doc is already in index!");
        }
        return returnInt;
    }
}
From source file:org.jab.docsearch.Index.java
License:Open Source License
/**
 * Updates a DocSearcherIndex
 *
 * @param di DocSearcherIndex
 */
public void updateIndex(final DocSearcherIndex di) {
    notesBuf = new StringBuffer();
    newItsBuf = new StringBuffer();
    modItsItsBuf = new StringBuffer();
    delItsItsBuf = new StringBuffer();
    totalChanges = 0;
    long curFileSizeBytes = 0;
    int errNum = 0;
    StringBuffer noRobotsBuf = new StringBuffer();
    int numNoIndex = 0;
    // int numErrors = 0;
    StringBuffer failedBuf = new StringBuffer();
    int addedSuccessFully = 0;
    failedBuf.append("\n");
    synchronized (this) {
        if (di.isCdrom()) {
            // do nothing
        } else if (di.getIsSpider()) {
            doSpiderUpdate(di);
        } else if (!di.getPath().toLowerCase().endsWith(".zip")) {
            // not a zip archive
            int numUpdates = 0;
            int numRemovals = 0;
            int numNew = 0;
            try {
                IndexReader ir = IndexReader.open(di.getIndexPath());
                int numDocs = ir.maxDoc();
                ds.setStatus("There are " + numDocs + " docs in index " + di.getName() + "(" + di.getPath() + ")");
                addHeader(di.getName());
                // ArrayList<String> allDocsInIndex = new ArrayList<String>(); // indexed files
                // ArrayList allDocsInFolder = new ArrayList(); // current files
                // ArrayList newDocsToAdd = new ArrayList(); // files to be added that are new
                ds.setIsWorking(true);
                ds.setProgressMax(numDocs);
                ds.setCurProgressMSG("Updating Modified Files...");
                setInsertMode(1); // note we are looking for modified files
                logger.info("updateIndex() updating " + numDocs + " documents from index");
                for (int i = 0; i < numDocs; i++) {
                    if (!ds.getIsWorking()) {
                        break;
                    }
                    if (!ir.isDeleted(i)) {
                        ds.setCurProgress(i);
                        Document doc = ir.document(i);
                        if (doc != null) {
                            String curFiName = doc.get(FIELD_PATH);
                            String curFiModDate = doc.get(FIELD_MODDATE);
                            File testFi = new File(curFiName);
                            // check file not found
                            if (testFi.exists()) {
                                // allDocsInIndex.add(curFiName);
                                String realFileModDate = DateTimeUtils.getTimeStringForIndex(testFi.lastModified());
                                // check file is changed
                                if (!realFileModDate.equals(curFiModDate)) {
                                    logger.info("updateIndex() updating " + curFiName + " in index");
                                    numUpdates++;
                                    // remove old document
                                    ir.deleteDocument(i);
                                    ir.close();
                                    // open writer to add document once again
                                    ds.setStatus("Reindexing: " + curFiName);
                                    IndexWriter iw = new IndexWriter(di.getIndexPath(), new StandardAnalyzer(), false);
                                    // next line should remove too many files open errors
                                    // iw.setUseCompoundFile(true);
                                    addedSuccessFully = addDocToIndex(curFiName, iw, di, di.isCdrom(), null);
                                    iw.close();
                                    // reopen
                                    ir = IndexReader.open(di.getIndexPath());
                                    switch (addedSuccessFully) {
                                    case 1: // error
                                        errNum++;
                                        if (errNum < 8) {
                                            failedBuf.append("\n");
                                            failedBuf.append(curFiName);
                                        }
                                        ds.setStatus(DocSearch.dsErrIdxgFi + " " + curFiName);
                                        break;
                                    case 2: // meta robots = noindex
                                        numNoIndex++;
                                        if (numNoIndex < 8) {
                                            noRobotsBuf.append("\n");
                                            noRobotsBuf.append(curFiName);
                                        }
                                        ds.setStatus("No Indexing Meta Requirement found in : " + curFiName);
                                        break;
                                    default: // OK
                                        numUpdates++;
                                        ds.setStatus("Indexing " + curFiName + " complete.");
                                        break;
                                    } // end of switch
                                }
                            } else {
                                ds.setStatus("Deleting: " + curFiName);
                                logger.info("updateIndex() remove " + curFiName + " from index");
                                ir.deleteDocument(i);
                                addDelNote(doc);
                                numRemovals++;
                            }
                        }
                    } // end for not deleted
                    // else System.out.println("Document was null or deleted:" + i);
                } // end for getting docs
                ds.resetProgress();
                // now add the new files
                setInsertMode(0);
                ArrayList<String> folderList = new ArrayList<String>();
                folderList.add(di.getPath());
                int startSubNum = Utils.countSlash(di.getPath());
                int maxSubNum = startSubNum + di.getDepth();
                int lastItemNo = 0;
                int curItemNo = 0;
                int lastFound = 0;
                do {
                    // create our folder file
                    if (!ds.getIsWorking()) {
                        break;
                    }
                    String curFolderString = folderList.get(curItemNo);
                    logger.debug("updateIndex() folder=" + curFolderString);
                    File curFolderFile = new File(curFolderString);
                    int curSubNum = Utils.countSlash(curFolderString);
                    // handle any subfolders --> add them to our folderlist
                    String[] foldersString = curFolderFile.list(DocSearch.ff);
                    int numFolders = foldersString.length;
                    for (int i = 0; i < numFolders; i++) {
                        // add them to our folderlist
                        String curFold = curFolderString + pathSep + foldersString[i] + pathSep;
                        curFold = Utils.replaceAll(pathSep + pathSep, curFold, pathSep);
                        folderList.add(curFold);
                        lastItemNo++;
                        // debug output
                    } // end for having more than 0 folder
                    // add our files
                    String[] filesString = curFolderFile.list(DocSearch.wf);
                    int numFiles = filesString.length;
                    ds.setProgressMax(numDocs);
                    ds.setCurProgressMSG("Updating new Files...");
                    for (int i = 0; i < numFiles; i++) {
                        // add them to our folderlist
                        if (!ds.getIsWorking()) {
                            break;
                        }
                        String curFi = curFolderString + pathSep + filesString[i];
                        curFi = Utils.replaceAll(pathSep + pathSep, curFi, pathSep);
                        curFileSizeBytes = FileUtils.getFileSize(curFi);
                        if (curFileSizeBytes > ds.getMaxFileSize()) {
                            logger.debug("updateIndex() skipping " + curFi + " because it is too big");
                            ds.setStatus(I18n.getString("skipping_file_too_big") + " (" + curFileSizeBytes + ") " + filesString[i]);
                        } else {
                            lastFound = indexNum(lastFound, curFi, ir);
                            if (lastFound == -1) {
                                logger.info("updateIndex() adding " + curFi + " to index");
                                ir.close();
                                // open writer to add document once again
                                IndexWriter iw = new IndexWriter(di.getIndexPath(), new StandardAnalyzer(), false);
                                addedSuccessFully = addDocToIndex(curFi, iw, di, di.isCdrom(), null);
                                switch (addedSuccessFully) {
                                case 1: // error
                                    errNum++;
                                    if (errNum < 8) {
                                        failedBuf.append("\n");
                                        failedBuf.append(curFi);
                                    }
                                    ds.setStatus(DocSearch.dsErrIdxg + " " + curFi);
                                    break;
                                case 2: // meta robots = noindex
                                    numNoIndex++;
                                    if (numNoIndex < 8) {
                                        noRobotsBuf.append("\n");
                                        noRobotsBuf.append(curFi);
                                    }
                                    ds.setStatus("Document Exclusion (robots = NOINDEX) : " + curFi);
                                    break;
                                default: // OK
                                    numNew++;
                                    ds.setStatus("New Document Added : " + curFi);
                                    break;
                                } // end of switch
                                iw.close();
                                // reopen
                                ir = IndexReader.open(di.getIndexPath());
                            } // end for lastfound not -1
                        } // end for file size not too big
                        ds.setCurProgress(i);
                        ds.resetProgress();
                    } // end for having more than 0 folder
                    // increment our curItem
                    folderList.set(curItemNo, null); // remove memory overhead as you go!
                    curItemNo++;
                    if (curSubNum >= maxSubNum) {
                        break;
                    }
                    if (!ds.getIsWorking()) {
                        break;
                    }
                } while (curItemNo <= lastItemNo);
                ir.close(); // always close!
                StringBuffer updateMSGBuf = new StringBuffer();
                updateMSGBuf.append('\n');
                updateMSGBuf.append(numRemovals).append(" files were removed from index.\n");
                updateMSGBuf.append(numUpdates).append(" files were reindexed.\n");
                updateMSGBuf.append(numNew).append(" new files were added to the index.\n");
                totalChanges = numRemovals + numUpdates + numNew;
                // all our stuff to the notesBuf
                addNote(updateMSGBuf.toString(), "", true);
                // add our new and modified files
                if (numNew > 0) {
                    addNote(I18n.getString("new_files"), "", true);
                    notesBuf.append(newItsBuf);
                }
                if (numUpdates > 0) {
                    addNote(I18n.getString("updated_files"), "", true);
                    notesBuf.append(modItsItsBuf);
                }
                if (numRemovals > 0) {
                    addNote(I18n.getString("deleted_files"), "", true);
                    notesBuf.append(delItsItsBuf);
                }
                addFooter();
                if (errNum == 0) {
                    updateMSGBuf.append("No errors were encountered during this process.");
                    if (numNoIndex > 0) {
                        updateMSGBuf.append("\n\n").append(numNoIndex).append(
                                " files were not indexed due to meta data constraints (robots = NOINDEX), including:\n");
                        updateMSGBuf.append(noRobotsBuf);
                    }
                    ds.showMessage("Update of index " + di.getName() + " Completed", updateMSGBuf.toString());
                } else {
                    updateMSGBuf.append(errNum).append(
                            " errors were encountered during this process.\nThe following files had problems being indexed or re-indexed:\n")
                            .append(failedBuf);
                    if (numNoIndex > 0) {
                        updateMSGBuf.append("\n\n").append(numNoIndex).append(
                                " files were not indexed due to meta data constraints (robots = NOINDEX), including:\n");
                        updateMSGBuf.append(noRobotsBuf);
                    }
                    ds.showMessage("Errors during Update of index " + di.getName(), updateMSGBuf.toString());
                }
            } // end of try
            catch (Exception e) {
                logger.error("updateIndex() error during update index " + di.getName(), e);
                ds.showMessage("Error updating index " + di.getName(), e.toString());
            }
            addFooter();
            di.setLastIndexed(DateTimeUtils.getToday());
            ds.setStatus("Update of index " + di.getName() + " completed.");
            ds.setIsWorking(false);
        } else {
            ds.doZipArchiveUpdate(di);
        }
    }
}
From source file:org.jab.docsearch.spider.LinkFinder.java
License:Open Source License
/**
 * Method update
 *
 * @throws IOException
 */
public void update() throws IOException {
    numDeletes = 0;
    numChanges = 0;
    numNew = 0;
    numFails = 0;
    numUnChanged = 0;
    numMetaNoIdx = 0;
    IndexReader ir = IndexReader.open(dsi.getIndexPath());
    int maxNumDocs = ir.maxDoc();
    int maxTotal = maxNumDocs + maxNumDocs / 10;
    int curDocNum = 0;
    if (ds != null) {
        ds.setStatus(DocSearch.dsTtlDxInIdx + " " + maxNumDocs);
        ds.setIsWorking(true);
        ds.setProgressMax(maxTotal * 2);
        ds.setCurProgressMSG("Spidering Files...");
    }
    // assign index location to urls currently in the index
    int lastFound = 0;
    for (SpiderUrl spy : links) {
        curDocNum++;
        if (ds != null) {
            ds.setCurProgress(curDocNum);
            if (!ds.getIsWorking()) {
                break;
            }
        }
        String curFi = spy.getUrl();
        lastFound = ds.idx.spiderIndexNum(lastFound, curFi, ir);
        spy.setIndexLocation(lastFound);
        if (lastFound == -1) {
            logger.debug("update() " + curFi + " currently is not in the index");
        }
    }
    // now iterate over all the spider urls
    int curSpiderNum = getNextUrlNo();
    int totalSpidered = 0;
    while (curSpiderNum != -1) {
        curDocNum++;
        if (ds != null) {
            ds.setCurProgress(curDocNum);
            if (!ds.getIsWorking()) {
                break;
            }
        }
        SpiderUrl curSpider = getSpiderUrl(curSpiderNum);
        int curNumLinksFound = getNumLinksFound();
        int curIdxNum = curSpider.getIndexLocation();
        // TODO is this getsize really needed when the url is in the index?
        long curUrlSize = netUtils.getURLSize(curSpider.getUrl());
        String shortUrl = Utils.concatEnd(curSpider.getUrl(), 33);
        String dnldTmpName = getDownloadFileName(curSpider.getContentType(), curSpider.getUrl().toLowerCase());
        String downloadFile = FileUtils.addFolder(downloadFileDir, dnldTmpName);
        // document is too big
        if (curUrlSize > maxFileSizeToGet) {
            logger.debug("update() '" + shortUrl + "' is too big");
            setStatus(I18n.getString("skipping_file_too_big") + " (" + curUrlSize + " > " + maxFileSizeToGet + ") " + shortUrl);
            curSpider.setSize(curUrlSize);
        }
        // document is in index
        else if (curIdxNum != -1) {
            logger.debug("update() '" + shortUrl + "' is in index");
            setStatus(DocSearch.dsCkgFoUpdtsToDoc + " " + shortUrl + " (" + totalSpidered + " / " + curNumLinksFound + ")");
            int curSpiderStatus = netUtils.getURLStatus(curSpider, downloadFile);
            switch (curSpiderStatus) {
            case -1: // broken url
                logger.debug("update() '" + shortUrl + "' is broken");
                setStatus(DocSearch.dsBknLink + " " + shortUrl);
                curSpider.setIsDeadLink(true);
                // remove from index
                ir.deleteDocument(curIdxNum);
                numDeletes++;
                break;
            case 0: // same
                logger.debug("update() '" + shortUrl + "' no changes");
                setStatus(DocSearch.lnkNoChanges + " " + shortUrl);
                numUnChanged++;
                totalSpidered++;
                break;
            case 1: // changed
                logger.debug("update() '" + shortUrl + "' is changed");
                setStatus(DocSearch.dsReIdxgLnk + " " + shortUrl);
                ir.deleteDocument(curIdxNum);
                ir.close();
                iw = new IndexWriter(dsi.getIndexPath(), new StandardAnalyzer(), false);
                // iw.setUseCompoundFile(true);
                int curAddedSuccess = ds.idx.addDocToIndex(downloadFile, iw, dsi, false, curSpider);
                iw.close();
                ir = IndexReader.open(dsi.getIndexPath());
                if (curAddedSuccess == 0) {
                    numChanges++;
                    totalSpidered++;
                } else if (curAddedSuccess == 2) {
                    numMetaNoIdx++;
                } else if (curAddedSuccess == 1) {
                    logger.warn("update() indexing failed " + shortUrl);
                    numFails++;
                }
                // get links from downloaded file
                if (isHtml(curSpider.getUrl())) {
                    checkFileForLinks(downloadFile, curSpider.getUrl());
                }
                break;
            }
        }
        // document is not in index
        else {
            logger.debug("update() '" + shortUrl + "' is not in index");
            setStatus(DocSearch.dsSpiderNewUrl + " " + shortUrl + " (" + totalSpidered + " / " + curNumLinksFound + ")");
            boolean downloadOk = netUtils.downloadURLToFile(curSpider, downloadFile);
            if (downloadOk) {
                iw = new IndexWriter(dsi.getIndexPath(), new StandardAnalyzer(), false);
                // iw.setUseCompoundFile(true);
                int curAddedSuccess = ds.idx.addDocToIndex(downloadFile, iw, dsi, false, curSpider);
                iw.close();
                ir.close();
                ir = IndexReader.open(dsi.getIndexPath());
                if (curAddedSuccess == 0) {
                    numNew++;
                    totalSpidered++;
                } else if (curAddedSuccess == 2) {
                    numMetaNoIdx++;
                } else if (curAddedSuccess == 1) {
                    logger.warn("update() indexing failed " + shortUrl);
                    numFails++;
                }
                if (isHtml(curSpider.getUrl())) {
                    checkFileForLinks(downloadFile, curSpider.getUrl());
                }
            } else {
                setStatus(DocSearch.dsBknLink + " " + shortUrl);
                curSpider.setIsDeadLink(true);
            }
        }
        // last things to do
        curSpider.setSpidered(true);
        curSpiderNum = getNextUrlNo();
        if (curSpiderNum == -1) {
            break;
        }
        if (totalSpidered > maxTotal) {
            break;
        }
        // delete temp file
        if (!FileUtils.deleteFile(downloadFile)) {
            logger.warn("update() can't delete file '" + downloadFile + "'");
        }
    }
    setStatus(DocSearch.dsSpdrUpdteComp + " " + dsi.getName());
    saveAllLinks();
    // update the date of the index
    dsi.setLastIndexed(DateTimeUtils.getToday());
    ir.close();
    ds.resetProgress();
}
From source file:org.jab.docsearch.utils.MetaReport.java
License:Open Source License
/**
 * doMetaDataReport
 *
 * @param di
 * @param listAll
 * @param pathRequired
 * @param pathText
 * @param authRequired
 * @param authText
 * @param reportFile
 * @param maxDocs
 * @param useDaysOld
 * @param maxDays
 */
private void doMetaDataReport(DocSearcherIndex di, boolean listAll, boolean pathRequired, String pathText,
        boolean authRequired, String authText, String reportFile, int maxDocs, boolean useDaysOld, int maxDays) {
    try {
        // initialize our metrics
        int numBadDocs = 0;
        int totalDocs = 0;
        int numGoodDocs = 0;
        String lineSep = Utils.LINE_SEPARATOR;
        StringBuffer documentBuffer = new StringBuffer();
        StringBuffer metaDataReport = new StringBuffer();
        // initialize the reader
        IndexReader ir = IndexReader.open(di.getIndexPath());
        int numDocs = ir.maxDoc();
        ds.setStatus(numDocs + " " + Messages.getString("DocSearch.numDox") + " " + di.getName());
        // write the start of the table
        documentBuffer.append("<table style=\"empty-cells:show\" border=\"1\">").append(lineSep);
        documentBuffer.append("<tr>").append(lineSep);
        int numHdrs = allFields.length;
        for (int z = 0; z < numHdrs; z++) {
            documentBuffer.append("<th valign=\"top\">");
            documentBuffer.append(allFields[z]);
            documentBuffer.append("</th>").append(lineSep);
        }
        documentBuffer.append("</tr>").append(lineSep);
        for (int i = 0; i < numDocs; i++) {
            if (!ir.isDeleted(i)) {
                Document doc = ir.document(i);
                if (doc != null) {
                    boolean curSkip = false;
                    // put in the docs values
                    String path;
                    if (di.getIsWeb()) {
                        path = doc.get(Index.FIELD_URL);
                    } else {
                        path = doc.get(Index.FIELD_PATH);
                    }
                    ds.setStatus("Examining document: " + path);
                    String type = doc.get(Index.FIELD_TYPE);
                    String author = doc.get(Index.FIELD_AUTHOR);
                    String summary = doc.get(Index.FIELD_SUMMARY);
                    String title = doc.get(Index.FIELD_TITLE);
                    String size = doc.get(Index.FIELD_SIZE);
                    String keywords = doc.get(Index.FIELD_KEYWORDS);
                    String date = DateTimeUtils.getDateParsedFromIndex(doc.get(Index.FIELD_MODDATE));
                    // determine if we even need to examine it
                    if (pathRequired) {
                        if (path.indexOf(pathText) == -1) {
                            curSkip = true;
                        }
                    }
                    if (authRequired) {
                        if (author.indexOf(authText) == -1) {
                            curSkip = true;
                        }
                    }
                    // determine if it's bad or good
                    if (!curSkip) {
                        totalDocs++;
                        boolean isGood = goodMetaData(title, summary, author, date, keywords, type, useDaysOld, maxDays);
                        // write to our file
                        if (!isGood || listAll) {
                            documentBuffer.append("<tr>").append(lineSep);
                            documentBuffer.append("<td valign=\"top\">"); // path
                            documentBuffer.append(path);
                            documentBuffer.append("</td>").append(lineSep);
                            documentBuffer.append("<td valign=\"top\"><small>");
                            documentBuffer.append(Utils.convertTextToHTML(title));
                            documentBuffer.append("</small></td>").append(lineSep);
                            documentBuffer.append("<td valign=\"top\">");
                            documentBuffer.append(author);
                            documentBuffer.append("</td>").append(lineSep);
                            documentBuffer.append("<td valign=\"top\">");
                            documentBuffer.append(date);
                            documentBuffer.append("</td>").append(lineSep);
                            documentBuffer.append("<td valign=\"top\"><small>");
                            documentBuffer.append(Utils.convertTextToHTML(summary));
                            documentBuffer.append("</small></td>").append(lineSep);
                            documentBuffer.append("<td valign=\"top\"><small>");
                            documentBuffer.append(keywords);
                            documentBuffer.append("</small></td>").append(lineSep);
                            documentBuffer.append("<td valign=\"top\">");
                            documentBuffer.append(size);
                            documentBuffer.append("</td>").append(lineSep);
                            documentBuffer.append("<td valign=\"top\">");
                            documentBuffer.append(type);
                            documentBuffer.append("</td>").append(lineSep);
                            documentBuffer.append("</tr>").append(lineSep);
                        }
                        if (isGood) {
                            ds.setStatus(path + " " + dsNotMsgMeta);
                            numGoodDocs++;
                        } else {
                            ds.setStatus(path + " " + dsMsgMeta);
                            numBadDocs++;
                        }
                    } else {
                        ds.setStatus(dsSkip + " " + path);
                    }
                }
            }
            if (i > maxDocs) {
                break;
            }
        }
        documentBuffer.append("</table>").append(lineSep);
        int percentGood = 0;
        if (totalDocs > 0) {
            percentGood = (numGoodDocs * 100) / totalDocs;
        }
        ds.setStatus("% " + dsGood + ": " + percentGood + " (" + numGoodDocs + " / " + totalDocs + ", " + numBadDocs + " " + dsBad + ").");
        // write complete report with summary
        metaDataReport.append("<html>").append(lineSep);
        metaDataReport.append("<head>").append(lineSep);
        metaDataReport.append("<title>").append(dsMetaRpt).append(' ').append(di.getName()).append("</title>").append(lineSep);
        metaDataReport.append("<meta name=\"description\" content=\"lists documents with poorly searchable meta data\">").append(lineSep);
        metaDataReport.append("<meta name=\"author\" content=\"DocSearcher\">").append(lineSep);
        metaDataReport.append("</head>").append(lineSep);
        metaDataReport.append("<body>").append(lineSep);
        metaDataReport.append("<h1>").append(dsMetaRpt).append(' ').append(di.getName()).append("</h1>").append(lineSep);
        metaDataReport.append("<p align=\"left\"><b>");
        metaDataReport.append(numBadDocs);
        metaDataReport.append("</b> ");
        metaDataReport.append(dsPoorMeta);
        metaDataReport.append(" <br> & <b>");
        metaDataReport.append(numGoodDocs);
        metaDataReport.append("</b> ");
        metaDataReport.append(dsGoodMetaNum);
        metaDataReport.append(".</p>").append(lineSep);
        metaDataReport.append("<p align=\"left\">");
        metaDataReport.append(dsMetaOO);
        metaDataReport.append(" <b>");
        metaDataReport.append(percentGood + "</b> % . </p>");
        metaDataReport.append("<p align=\"left\">");
        metaDataReport.append(dsTblDsc);
        metaDataReport.append(".</p>").append(lineSep);
        // add document buffer
        metaDataReport.append(documentBuffer);
        metaDataReport.append("</body>").append(lineSep);
        metaDataReport.append("</html>").append(lineSep);
        ds.curPage = Messages.getString("DocSearch.report");
        boolean fileSaved = FileUtils.saveFile(reportFile, metaDataReport);
        if (fileSaved) {
            ds.doExternal(reportFile);
        }
    } catch (IOException ioe) {
        logger.fatal("doMetaDataReport() create meta data report failed", ioe);
        ds.setStatus(Messages.getString("DocSearch.statusMetaDataError") + di.getName() + ":" + ioe.toString());
    }
}
From source file:org.lexevs.dao.index.lucenesupport.BaseLuceneIndexTemplate.java
License:Open Source License
public int getMaxDoc() {
    return this.doInIndexReader(new IndexReaderCallback<Integer>() {
        @Override
        public Integer doInIndexReader(IndexReader indexReader) throws Exception {
            return indexReader.maxDoc();
        }
    });
}
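The template above hides reader acquisition and release behind a callback, so maxDoc() is read while the reader is guaranteed to be open. Under the same assumed interface, a sibling accessor for the live-document count might look like the following (getNumDocs is a hypothetical addition, not part of the quoted class):

public int getNumDocs() {
    return this.doInIndexReader(new IndexReaderCallback<Integer>() {
        @Override
        public Integer doInIndexReader(IndexReader indexReader) throws Exception {
            return indexReader.numDocs(); // live documents only; maxDoc() also counts deleted slots
        }
    });
}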
From source file:org.metaservice.core.maven.MavenIndexCrawler.java
License:Apache License
public void perform() throws IOException, ComponentLookupException, InvalidVersionSpecificationException {
    // Files where local cache is (if any) and Lucene Index should be located
    File centralLocalCache = new File("target/central-cache");
    File centralIndexDir = new File("target/central-index");
    // Creators we want to use (search for fields it defines)
    List<IndexCreator> indexers = new ArrayList<>();
    indexers.add(plexusContainer.lookup(IndexCreator.class, "min"));
    indexers.add(plexusContainer.lookup(IndexCreator.class, "jarContent"));
    indexers.add(plexusContainer.lookup(IndexCreator.class, "maven-plugin"));
    // Create context for central repository index
    centralContext = indexer.createIndexingContext("central-context", "central", centralLocalCache,
            centralIndexDir, "http://repo1.maven.org/maven2", null, true, true, indexers);
    // Update the index (incremental update will happen if this is not 1st run and files are not deleted)
    // This whole block below should not be executed on every app start, but rather controlled by some configuration
    // since this block will always emit at least one HTTP GET. Central indexes are updated once a week, but
    // other index sources might have different index publishing frequency.
    // Preferred frequency is once a week.
    if (true) {
        System.out.println("Updating Index...");
        System.out.println("This might take a while on first run, so please be patient!");
        // Create ResourceFetcher implementation to be used with IndexUpdateRequest
        // Here, we use Wagon based one as shorthand, but all we need is a ResourceFetcher implementation
        TransferListener listener = new AbstractTransferListener() {
            public void transferStarted(TransferEvent transferEvent) {
                System.out.print("  Downloading " + transferEvent.getResource().getName());
            }

            public void transferProgress(TransferEvent transferEvent, byte[] buffer, int length) {
            }

            public void transferCompleted(TransferEvent transferEvent) {
                System.out.println(" - Done");
            }
        };
        ResourceFetcher resourceFetcher = new WagonHelper.WagonFetcher(httpWagon, listener, null, null);
        Date centralContextCurrentTimestamp = centralContext.getTimestamp();
        IndexUpdateRequest updateRequest = new IndexUpdateRequest(centralContext, resourceFetcher);
        IndexUpdateResult updateResult = indexUpdater.fetchAndUpdateIndex(updateRequest);
        if (updateResult.isFullUpdate()) {
            System.out.println("Full update happened!");
        } else if (updateResult.getTimestamp().equals(centralContextCurrentTimestamp)) {
            System.out.println("No update needed, index is up to date!");
        } else {
            System.out.println("Incremental update happened, change covered " + centralContextCurrentTimestamp
                    + " - " + updateResult.getTimestamp() + " period.");
        }
        System.out.println();
    }
    System.out.println();
    System.out.println("Using index");
    System.out.println("===========");
    System.out.println();
    // ====
    // Case:
    // dump all the GAVs
    // NOTE: we will not actually do this below, as it takes too long to run (Central is HUGE), but it is here
    // as a code example
    int j = 0;
    if (true) {
        final IndexSearcher searcher = centralContext.acquireIndexSearcher();
        try {
            final IndexReader ir = searcher.getIndexReader();
            for (int i = 0; i < ir.maxDoc(); i++) {
                if (!ir.isDeleted(i)) {
                    j++;
                    final Document doc = ir.document(i);
                    final ArtifactInfo ai = IndexUtils.constructArtifactInfo(doc, centralContext);
                    if (ai != null && "pom".equals(ai.fextension))
                        System.out.println(ai.groupId + ":" + ai.artifactId + ":" + ai.version + ":"
                                + ai.classifier + " (sha1=" + ai.sha1 + ")");
                }
            }
        } finally {
            centralContext.releaseIndexSearcher(searcher);
        }
    }
    System.err.println(j);
    if (j > 0)
        return;
    // ====
    // Case:
    // Search for all GAVs with known G and A and having version greater than V
    final GenericVersionScheme versionScheme = new GenericVersionScheme();
    final String versionString = "1.5.0";
    final Version version = versionScheme.parseVersion(versionString);
    // construct the query for known GA
    final Query groupIdQ = indexer.constructQuery(MAVEN.GROUP_ID, new SourcedSearchExpression("org.sonatype.nexus"));
    final Query artifactIdQ = indexer.constructQuery(MAVEN.ARTIFACT_ID, new SourcedSearchExpression("nexus-api"));
    final BooleanQuery query = new BooleanQuery();
    query.add(groupIdQ, BooleanClause.Occur.MUST);
    query.add(artifactIdQ, BooleanClause.Occur.MUST);
    // we want "jar" artifacts only
    query.add(indexer.constructQuery(MAVEN.PACKAGING, new SourcedSearchExpression("jar")), BooleanClause.Occur.MUST);
    // we want main artifacts only (no classifier)
    // Note: this below is unfinished API, needs fixing
    query.add(indexer.constructQuery(MAVEN.CLASSIFIER, new SourcedSearchExpression(Field.NOT_PRESENT)),
            BooleanClause.Occur.MUST_NOT);
    // construct the filter to express "V greater than"
    final ArtifactInfoFilter versionFilter = new ArtifactInfoFilter() {
        public boolean accepts(final IndexingContext ctx, final ArtifactInfo ai) {
            try {
                final Version aiV = versionScheme.parseVersion(ai.version);
                // Use ">=" if you are INCLUSIVE
                return aiV.compareTo(version) > 0;
            } catch (InvalidVersionSpecificationException e) {
                // do something here? be safe and include?
                return true;
            }
        }
    };
    System.out.println("Searching for all GAVs with G=org.sonatype.nexus and nexus-api and having V greater than 1.5.0");
    final IteratorSearchRequest request = new IteratorSearchRequest(query,
            Collections.singletonList(centralContext), versionFilter);
    final IteratorSearchResponse response = indexer.searchIterator(request);
    for (ArtifactInfo ai : response) {
        System.out.println(ai.toString());
    }
    // Case:
    // Use index
    // Searching for some artifact
    Query gidQ = indexer.constructQuery(MAVEN.GROUP_ID, new SourcedSearchExpression("org.apache.maven.indexer"));
    Query aidQ = indexer.constructQuery(MAVEN.ARTIFACT_ID, new SourcedSearchExpression("indexer-artifact"));
    BooleanQuery bq = new BooleanQuery();
    bq.add(gidQ, BooleanClause.Occur.MUST);
    bq.add(aidQ, BooleanClause.Occur.MUST);
    searchAndDump(indexer, "all artifacts under GA org.apache.maven.indexer:indexer-artifact", bq);
    // Searching for some main artifact
    bq = new BooleanQuery();
    bq.add(gidQ, BooleanClause.Occur.MUST);
    bq.add(aidQ, BooleanClause.Occur.MUST);
    // bq.add( nexusIndexer.constructQuery( MAVEN.CLASSIFIER, new SourcedSearchExpression( "*" ) ), Occur.MUST_NOT );
    searchAndDump(indexer, "main artifacts under GA org.apache.maven.indexer:indexer-artifact", bq);
    // doing sha1 search
    searchAndDump(indexer, "SHA1 7ab67e6b20e5332a7fb4fdf2f019aec4275846c2",
            indexer.constructQuery(MAVEN.SHA1, new SourcedSearchExpression("7ab67e6b20e5332a7fb4fdf2f019aec4275846c2")));
    searchAndDump(indexer, "SHA1 7ab67e6b20 (partial hash)",
            indexer.constructQuery(MAVEN.SHA1, new UserInputSearchExpression("7ab67e6b20")));
    // doing classname search (incomplete classname)
    searchAndDump(indexer, "classname DefaultNexusIndexer (note: Central does not publish classes in the index)",
            indexer.constructQuery(MAVEN.CLASSNAMES, new UserInputSearchExpression("DefaultNexusIndexer")));
    // doing search for all "canonical" maven plugins latest versions
    bq = new BooleanQuery();
    bq.add(indexer.constructQuery(MAVEN.PACKAGING, new SourcedSearchExpression("maven-plugin")), BooleanClause.Occur.MUST);
    bq.add(indexer.constructQuery(MAVEN.GROUP_ID, new SourcedSearchExpression("org.apache.maven.plugins")), BooleanClause.Occur.MUST);
    searchGroupedAndDump(indexer, "all \"canonical\" maven plugins", bq, new GAGrouping());
    // close cleanly
    indexer.closeIndexingContext(centralContext, false);
}