Example usage for org.apache.lucene.index IndexReader maxDoc

List of usage examples for org.apache.lucene.index IndexReader maxDoc

Introduction

This page shows example usage of org.apache.lucene.index IndexReader maxDoc.

Prototype

public abstract int maxDoc();

Document

Returns one greater than the largest possible document number.
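Document numbers are dense integers starting at 0, so maxDoc() is the exclusive upper bound for scanning every slot in an index. A minimal sketch using the pre-4.0 API that the examples below rely on (isDeleted and per-reader document were removed in Lucene 4):

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;

public static void printAllDocuments(IndexReader reader) throws Exception {
    for (int i = 0; i < reader.maxDoc(); i++) { // IDs run from 0 to maxDoc() - 1
        if (reader.isDeleted(i)) {
            continue; // slot belongs to a deleted document
        }
        Document doc = reader.document(i);
        System.out.println(doc);
    }
}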

Usage

From source file:org.hippoecm.repository.query.lucene.util.CachingMultiReaderQueryFilter.java

License:Apache License

@Override
public DocIdSet getDocIdSet(final IndexReader reader) throws IOException {
    if (reader instanceof MultiIndexReader) {
        MultiIndexReader multiIndexReader = (MultiIndexReader) reader;

        IndexReader[] indexReaders = multiIndexReader.getIndexReaders();
        DocIdSet[] docIdSets = new DocIdSet[indexReaders.length];
        int[] maxDocs = new int[indexReaders.length];
        for (int i = 0; i < indexReaders.length; i++) {
            IndexReader subReader = indexReaders[i];
            docIdSets[i] = getIndexReaderDocIdSet(subReader, subReader);
            maxDocs[i] = subReader.maxDoc();
        }

        return new MultiDocIdSet(docIdSets, maxDocs);
    }
    log.warn(
            "MultiIndexReader was expected but not found. Do not dissect the reader but use it as one instead");

    return getIndexReaderDocIdSet(reader, reader);
}
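
MultiDocIdSet (a Hippo class, not shown here) combines the per-reader sets using the collected maxDoc() values. A minimal sketch of the underlying idea, assuming it offsets each sub-reader's local doc IDs by the cumulative maxDoc of the readers before it; the real implementation may differ:

import java.io.IOException;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;

// hypothetical composite iterator illustrating the offset arithmetic
class CompositeDocIdSetIterator extends DocIdSetIterator {
    private final DocIdSetIterator[] subIterators;
    private final int[] starts; // starts[i] = sum of maxDocs[0..i-1]
    private int current = 0;
    private int doc = -1;

    CompositeDocIdSetIterator(DocIdSet[] docIdSets, int[] maxDocs) throws IOException {
        subIterators = new DocIdSetIterator[docIdSets.length];
        starts = new int[docIdSets.length];
        int offset = 0;
        for (int i = 0; i < docIdSets.length; i++) {
            subIterators[i] = docIdSets[i].iterator();
            starts[i] = offset;
            offset += maxDocs[i];
        }
    }

    @Override
    public int docID() {
        return doc;
    }

    @Override
    public int nextDoc() throws IOException {
        while (current < subIterators.length) {
            int local = subIterators[current].nextDoc();
            if (local != NO_MORE_DOCS) {
                return doc = starts[current] + local; // translate to the global ID space
            }
            current++; // sub-reader exhausted, move on to the next one
        }
        return doc = NO_MORE_DOCS;
    }

    @Override
    public int advance(int target) throws IOException {
        int d;
        while ((d = nextDoc()) < target) {
            // linear advance is enough for a sketch
        }
        return d;
    }
}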

From source file:org.hippoecm.repository.query.lucene.util.CachingMultiReaderQueryFilter.java

License:Apache License

private OpenBitSet createDocIdSet(IndexReader reader) throws IOException {
    final OpenBitSet bits = new OpenBitSet(reader.maxDoc());

    long start = System.currentTimeMillis();
    new IndexSearcher(reader).search(query, new AbstractHitCollector() {

        @Override
        public final void collect(int doc, float score) {
            bits.set(doc); // set bit for hit
        }
    });
    log.info("Creating CachingMultiReaderQueryFilter doc id set took {} ms.",
            String.valueOf(System.currentTimeMillis() - start));
    return bits;
}
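
A usage sketch for the returned set: in Lucene 3.x OpenBitSet extends DocIdSet, so the cached bits can be iterated directly; every reported ID is below the maxDoc() the set was sized with.

    OpenBitSet bits = createDocIdSet(reader);
    DocIdSetIterator it = bits.iterator();
    for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
        // doc is a matching document number in [0, reader.maxDoc())
    }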

From source file:org.infoglue.cms.controllers.kernel.impl.simple.LuceneController.java

License:Open Source License

public Map getIndexInformation() {
    Map info = new HashMap();

    try {
        String index = CmsPropertyHandler.getContextRootPath() + File.separator + "lucene" + File.separator
                + "index";

        boolean indexExists = IndexReader.indexExists(new File(index));
        if (!indexExists) {
            try {
                File INDEX_DIR = new File(index);
                IndexWriter writer = new IndexWriter(INDEX_DIR, new StandardAnalyzer());
                logger.info("Indexing to directory '" + INDEX_DIR + "'...");
                writer.deleteDocuments(new Term("initializer", "true"));
                logger.info("Optimizing...");
                writer.optimize();
                writer.close();
            } catch (Exception e) {
                logger.error("Error creating index:" + e.getMessage(), e);
            }
        }

        IndexReader reader = IndexReader.open(index);
        int maxDoc = reader.maxDoc();
        int numDoc = reader.numDocs();
        long lastModified = IndexReader.lastModified(index);

        info.put("maxDoc", new Integer(maxDoc));
        info.put("numDoc", new Integer(numDoc));
        info.put("lastModified", new Date(lastModified));

        reader.close();
    } catch (Exception e) {
        logger.error("Error getting index information:" + e.getMessage(), e);
    }

    return info;
}
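
maxDoc() counts every document slot while numDocs() excludes deletions, so the difference between the two values put into the map above is the number of deleted-but-not-yet-merged documents. A sketch of an extra entry that could be added inside the try block (the "deletedDoc" key is hypothetical):

    int deletedDocs = maxDoc - numDoc; // slots still held by deleted documents
    info.put("deletedDoc", new Integer(deletedDocs)); // hypothetical extra entry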

From source file:org.jab.docsearch.Index.java

License:Open Source License

/**
 * The location of a URL in an index; used in the algorithm for updating an
 * index.
 *
 * @return the location of the SpiderUrl in a web oriented DocSearcherIndex,
 *         or -1 if the URL is not in the index
 */
public int spiderIndexNum(int lastFound, String fileName, IndexReader ir) {
    int returnInt = -1;
    synchronized (this) {
        if (lastFound == -1)
            lastFound = 0;
        try {
            Document doc;
            String compareName = "";
            int numDocs = ir.maxDoc();
            for (int i = lastFound; i < numDocs; i++) {
                if (!ir.isDeleted(i)) {
                    doc = ir.document(i);
                    if (doc != null) {
                        compareName = doc.get(FIELD_URL);
                        if (compareName.equals(fileName)) {
                            returnInt = i;
                            break;
                        }
                    }
                }
            }
            if (returnInt == -1) {
                for (int i = lastFound; i >= 0; i--) { // i >= 0 so that doc 0 is also checked
                    if (!ir.isDeleted(i)) {
                        doc = ir.document(i);
                        if (doc != null) {
                            compareName = doc.get(FIELD_URL);
                            // System.out.println("Comparing "+compareName+"
                            // to "+fileName);
                            if (compareName.equals(fileName)) {
                                // System.out.println("MATCH FOUND AT "+i);
                                returnInt = i;
                                break;
                            }
                        }
                    }
                }
            }

            if (returnInt == -1)
                ds.setStatus("File " + fileName + " not found in index!");
        } catch (Exception e) {
            logger.error("spiderIndexNum() failed", e);
            ds.setStatus("Error determining if doc is already in index!");
        }
        return returnInt;
    }
}

From source file:org.jab.docsearch.Index.java

License:Open Source License

/**
 * Location of a file in a DocSearcher index; used by the update algorithm to
 * update an index.
 *
 * @return location of the document in the DocSearcherIndex or -1 if it is
 *         not in there
 */
public int indexNum(int lastFound, String fileName, IndexReader ir) {
    int returnInt = -1;
    synchronized (this) {
        if (lastFound == -1)
            lastFound = 0;
        try {
            Document doc;
            String compareName = "";
            int numDocs = ir.maxDoc();
            for (int i = lastFound; i < numDocs; i++) {
                if (!ir.isDeleted(i)) {
                    doc = ir.document(i);
                    if (doc != null) {
                        compareName = doc.get(FIELD_PATH);
                        if (compareName.equals(fileName)) {
                            returnInt = i;
                            break;
                        }
                    }
                }
            }
            if (returnInt == -1) {
                for (int i = lastFound; i >= 0; i--) { // i >= 0 so that doc 0 is also checked
                    if (!ir.isDeleted(i)) {
                        doc = ir.document(i);
                        if (doc != null) {
                            compareName = doc.get(FIELD_PATH);
                            // System.out.println("Comparing "+compareName+"
                            // to "+fileName);
                            if (compareName.equals(fileName)) {
                                // System.out.println("MATCH FOUND AT "+i);
                                returnInt = i;
                                break;
                            }
                        }
                    }
                }
            }

            if (returnInt == -1)
                ds.setStatus("File " + fileName + " not found in index!");
        } catch (Exception e) {
            logger.error("indexNum() failed", e);
            ds.setStatus("Error determining if doc is already in index!");
        }
        return returnInt;
    }
}

From source file:org.jab.docsearch.Index.java

License:Open Source License

/**
 * Updates a DocSearcherIndex.
 *
 * @param di  DocSearcherIndex
 */
public void updateIndex(final DocSearcherIndex di) {
    notesBuf = new StringBuffer();
    newItsBuf = new StringBuffer();
    modItsItsBuf = new StringBuffer();
    delItsItsBuf = new StringBuffer();
    totalChanges = 0;
    long curFileSizeBytes = 0;
    int errNum = 0;
    StringBuffer noRobotsBuf = new StringBuffer();
    int numNoIndex = 0;
    // int numErrors = 0;
    StringBuffer failedBuf = new StringBuffer();
    int addedSuccessFully = 0;
    failedBuf.append("\n");
    synchronized (this) {
        if (di.isCdrom()) {
            // do nothing
        } else if (di.getIsSpider()) {
            doSpiderUpdate(di);
        } else if (!di.getPath().toLowerCase().endsWith(".zip")) { // not a zip archive
            int numUpdates = 0;
            int numRemovals = 0;
            int numNew = 0;
            try {
                IndexReader ir = IndexReader.open(di.getIndexPath());
                int numDocs = ir.maxDoc();
                ds.setStatus(
                        "There are " + numDocs + " docs in index " + di.getName() + "(" + di.getPath() + ")");
                addHeader(di.getName());
                //ArrayList<String> allDocsInIndexx = new ArrayList<String>(); // indexed files
                // ArrayList allDocsInFolder = new ArrayList(); // current files
                // ArrayList newDocsToAdd = new ArrayList(); // files to be added that are new
                ds.setIsWorking(true);
                ds.setProgressMax(numDocs);
                ds.setCurProgressMSG("Updating Modified Files...");
                setInsertMode(1); // note we are looking for modified files

                logger.info("updateIndex() updating " + numDocs + " document from index");

                for (int i = 0; i < numDocs; i++) {
                    if (!ds.getIsWorking()) {
                        break;
                    }
                    if (!ir.isDeleted(i)) {
                        ds.setCurProgress(i);
                        Document doc = ir.document(i);
                        if (doc != null) {
                            String curFiName = doc.get(FIELD_PATH);
                            String curFiModDate = doc.get(FIELD_MODDATE);
                            File testFi = new File(curFiName);

                            // check file not found
                            if (testFi.exists()) {
                                //allDocsInIndex.add(curFiName);
                                String realFileModDate = DateTimeUtils
                                        .getTimeStringForIndex(testFi.lastModified());

                                // check file is changed
                                if (!realFileModDate.equals(curFiModDate)) {
                                    logger.info("updateIndex() updating " + curFiName + " in index");

                                    // remove old document (a successful reindex is counted in the switch below)
                                    ir.deleteDocument(i);
                                    ir.close();
                                    // open writer to add document once again
                                    ds.setStatus("Reindexing: " + curFiName);
                                    IndexWriter iw = new IndexWriter(di.getIndexPath(), new StandardAnalyzer(),
                                            false);
                                    // next line should remove too many files open errors
                                    // iw.setUseCompoundFile(true);
                                    addedSuccessFully = addDocToIndex(curFiName, iw, di, di.isCdrom(), null);
                                    iw.close();
                                    // reopen
                                    ir = IndexReader.open(di.getIndexPath());
                                    switch (addedSuccessFully) {
                                    case 1: // error
                                        errNum++;
                                        if (errNum < 8) {
                                            failedBuf.append("\n");
                                            failedBuf.append(curFiName);
                                        }
                                        ds.setStatus(DocSearch.dsErrIdxgFi + " " + curFiName);
                                        break;
                                    case 2: // meta robots = noindex
                                        numNoIndex++;
                                        if (numNoIndex < 8) {
                                            noRobotsBuf.append("\n");
                                            noRobotsBuf.append(curFiName);
                                        }
                                        ds.setStatus("No Indexing Meta Requirement found in : " + curFiName);
                                        break;
                                    default: // OK
                                        numUpdates++;
                                        ds.setStatus("Indexing " + curFiName + " complete.");
                                        break;
                                    } // end of switch
                                }
                            } else {
                                ds.setStatus("Deleting: " + curFiName);
                                logger.info("updateIndex() remove " + curFiName + " from index");
                                ir.deleteDocument(i);
                                addDelNote(doc);
                                numRemovals++;
                            }
                        }
                    }
                    // end if not deleted
                    // else System.out.println("Document was null or deleted:" + i);
                }
                // end of loop over docs in the index
                ds.resetProgress();

                // now add the new files
                setInsertMode(0);
                ArrayList<String> folderList = new ArrayList<String>();
                folderList.add(di.getPath());
                int startSubNum = Utils.countSlash(di.getPath());
                int maxSubNum = startSubNum + di.getDepth();
                int lastItemNo = 0;
                int curItemNo = 0;
                int lastFound = 0;
                do {
                    // create our folder file
                    if (!ds.getIsWorking()) {
                        break;
                    }
                    String curFolderString = folderList.get(curItemNo);
                    logger.debug("updateIndex() folder=" + curFolderString);

                    File curFolderFile = new File(curFolderString);
                    int curSubNum = Utils.countSlash(curFolderString);
                    // handle any subfolders --> add them to our folderlist
                    String[] foldersString = curFolderFile.list(DocSearch.ff);
                    int numFolders = foldersString.length;
                    for (int i = 0; i < numFolders; i++) {
                        // add them to our folderlist
                        String curFold = curFolderString + pathSep + foldersString[i] + pathSep;
                        curFold = Utils.replaceAll(pathSep + pathSep, curFold, pathSep);
                        folderList.add(curFold);
                        lastItemNo++;
                        // debug output
                    }
                    // end of loop over subfolders
                    // add our files
                    String[] filesString = curFolderFile.list(DocSearch.wf);
                    int numFiles = filesString.length;
                    ds.setProgressMax(numDocs);
                    ds.setCurProgressMSG("Updating new Files...");

                    for (int i = 0; i < numFiles; i++) {
                        // add them to our folderlist
                        if (!ds.getIsWorking()) {
                            break;
                        }
                        String curFi = curFolderString + pathSep + filesString[i];
                        curFi = Utils.replaceAll(pathSep + pathSep, curFi, pathSep);
                        curFileSizeBytes = FileUtils.getFileSize(curFi);
                        if (curFileSizeBytes > ds.getMaxFileSize()) {
                            logger.debug("updateIndex() skipping " + curFi + " because is to big");
                            ds.setStatus(I18n.getString("skipping_file_too_big") + " (" + curFileSizeBytes
                                    + ") " + filesString[i]);
                        } else {
                            lastFound = indexNum(lastFound, curFi, ir);
                            if (lastFound == -1) {
                                logger.info("updateIndex() adding " + curFi + " to index");

                                ir.close();
                                // open writer to add document once again
                                IndexWriter iw = new IndexWriter(di.getIndexPath(), new StandardAnalyzer(),
                                        false);
                                addedSuccessFully = addDocToIndex(curFi, iw, di, di.isCdrom(), null);
                                switch (addedSuccessFully) {
                                case 1: // error
                                    errNum++;
                                    if (errNum < 8) {
                                        failedBuf.append("\n");
                                        failedBuf.append(curFi);
                                    }
                                    ds.setStatus(DocSearch.dsErrIdxg + " " + curFi);
                                    break;
                                case 2: // meta robots = noindex
                                    numNoIndex++;
                                    if (numNoIndex < 8) {
                                        noRobotsBuf.append("\n");
                                        noRobotsBuf.append(curFi);
                                    }
                                    ds.setStatus("Document Exlusion (robots = NOINDEX) : " + curFi);
                                    break;
                                default: // OK
                                    numNew++;
                                    ds.setStatus("New Document Added : " + curFi);
                                    break;
                                } // end of switch
                                iw.close();
                                // reopen
                                ir = IndexReader.open(di.getIndexPath());
                            } // end for lastfound not -1
                        } // end for file size not too big
                        ds.setCurProgress(i);
                        ds.resetProgress();
                    }
                    // end of loop over files in this folder
                    // increment our curItem
                    folderList.set(curItemNo, null); // remove memory overhead as you go!
                    curItemNo++;
                    if (curSubNum >= maxSubNum) {
                        break;
                    }
                    if (!ds.getIsWorking()) {
                        break;
                    }
                } while (curItemNo <= lastItemNo);
                //
                ir.close(); // always close!
                StringBuffer updateMSGBuf = new StringBuffer();
                updateMSGBuf.append('\n');
                updateMSGBuf.append(numRemovals).append(" files were removed from index.\n");
                updateMSGBuf.append(numUpdates).append(" files were reindexed.\n");
                updateMSGBuf.append(numNew).append(" new files were added to the index.\n");
                //
                totalChanges = numRemovals + numUpdates + numNew;
                // all our stuff to the notesBuf
                addNote(updateMSGBuf.toString(), "", true);
                // add our new and modified files
                if (numNew > 0) {
                    addNote(I18n.getString("new_files"), "", true);
                    notesBuf.append(newItsBuf);
                }
                //
                if (numUpdates > 0) {
                    addNote(I18n.getString("updated_files"), "", true);
                    notesBuf.append(modItsItsBuf);
                }
                //
                //
                if (numRemovals > 0) {
                    addNote(I18n.getString("deleted_files"), "", true);
                    notesBuf.append(delItsItsBuf);
                }
                //

                addFooter();
                if (errNum == 0) {
                    updateMSGBuf.append("No errors were encountered during this process.");
                    if (numNoIndex > 0) {
                        updateMSGBuf.append("\n\n").append(numNoIndex).append(
                                " files were not indexed due to meta data constraints (robots = NOINDEX), including:\n");
                        updateMSGBuf.append(noRobotsBuf);
                    }
                    ds.showMessage("Update of index " + di.getName() + " Completed", updateMSGBuf.toString());
                } else {
                    updateMSGBuf.append(errNum).append(
                            " errors were encountered during this process.\nThe following files had problems being indexed or re-indexed:\n")
                            .append(failedBuf);
                    if (numNoIndex > 0) {
                        updateMSGBuf.append("\n\n").append(numNoIndex).append(
                                " files were not indexed due to meta data constraints (robots = NOINDEX), including:\n");
                        updateMSGBuf.append(noRobotsBuf);
                    }

                    ds.showMessage("Errors during Update of index " + di.getName(), updateMSGBuf.toString());
                }
            }
            // end of try
            catch (Exception e) {
                logger.error("updateIndex() error during update index " + di.getName(), e);
                ds.showMessage("Error updating index " + di.getName(), e.toString());
            }

            addFooter();
            di.setLastIndexed(DateTimeUtils.getToday());
            ds.setStatus("Update of index " + di.getName() + " completed.");
            ds.setIsWorking(false);
        } else {
            ds.doZipArchiveUpdate(di);
        }
    }
}
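
Note the close/reopen cycle around every IndexWriter use in this method: in pre-4.0 Lucene a reader that performs deletes holds the index write lock, so the reader must be closed before a writer can open on the same directory, and reopened afterwards for maxDoc() and isDeleted() to reflect the writer's changes.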

From source file:org.jab.docsearch.spider.LinkFinder.java

License:Open Source License

/**
 * Updates the index from the spidered URLs.
 *
 * @throws IOException
 */
public void update() throws IOException {
    numDeletes = 0;
    numChanges = 0;
    numNew = 0;
    numFails = 0;
    numUnChanged = 0;
    numMetaNoIdx = 0;

    IndexReader ir = IndexReader.open(dsi.getIndexPath());
    int maxNumDocs = ir.maxDoc();
    int maxTotal = maxNumDocs + maxNumDocs / 10;
    int curDocNum = 0;
    if (ds != null) {
        ds.setStatus(DocSearch.dsTtlDxInIdx + " " + maxNumDocs);
        ds.setIsWorking(true);
        ds.setProgressMax(maxTotal * 2);
        ds.setCurProgressMSG("Spidering Files...");
    }

    // assign index location to urls currently in the index
    int lastFound = 0;
    for (SpiderUrl spy : links) {
        curDocNum++;

        if (ds != null) {
            ds.setCurProgress(curDocNum);
            if (!ds.getIsWorking()) {
                break;
            }
        }

        String curFi = spy.getUrl();
        lastFound = ds.idx.spiderIndexNum(lastFound, curFi, ir);
        spy.setIndexLocation(lastFound);

        if (lastFound == -1) {
            logger.debug("update() " + curFi + " currently is not in the index");
        }
    }

    // now iterate over all the spider urls
    int curSpiderNum = getNextUrlNo();
    int totalSpidered = 0;
    while (curSpiderNum != -1) {
        curDocNum++;

        if (ds != null) {
            ds.setCurProgress(curDocNum);
            if (!ds.getIsWorking()) {
                break;
            }
        }

        SpiderUrl curSpider = getSpiderUrl(curSpiderNum);
        int curNumLinksFound = getNumLinksFound();
        int curIdxNum = curSpider.getIndexLocation();
        // TODO is this getURLSize really needed when the URL is already in the index?
        long curUrlSize = netUtils.getURLSize(curSpider.getUrl());
        String shortUrl = Utils.concatEnd(curSpider.getUrl(), 33);
        String dnldTmpName = getDownloadFileName(curSpider.getContentType(), curSpider.getUrl().toLowerCase());
        String downloadFile = FileUtils.addFolder(downloadFileDir, dnldTmpName);

        // document is too big
        if (curUrlSize > maxFileSizeToGet) {
            logger.debug("update() '" + shortUrl + "' is too big");
            setStatus(I18n.getString("skipping_file_too_big") + " (" + curUrlSize + " > " + maxFileSizeToGet
                    + ") " + shortUrl);
            curSpider.setSize(curUrlSize);
        }
        // document is in index
        else if (curIdxNum != -1) {
            logger.debug("update() '" + shortUrl + "' is in index");
            setStatus(DocSearch.dsCkgFoUpdtsToDoc + " " + shortUrl + " (" + totalSpidered + " / "
                    + curNumLinksFound + ")");

            int curSpiderStatus = netUtils.getURLStatus(curSpider, downloadFile);
            switch (curSpiderStatus) {
            case -1: // broken url
                logger.debug("update() '" + shortUrl + "' is broken");
                setStatus(DocSearch.dsBknLink + " " + shortUrl);
                curSpider.setIsDeadLink(true);
                // remove from index
                ir.deleteDocument(curIdxNum);
                numDeletes++;
                break;
            case 0: // same
                logger.debug("update() '" + shortUrl + "' no changes");
                setStatus(DocSearch.lnkNoChanges + " " + shortUrl);
                numUnChanged++;
                totalSpidered++;
                break;
            case 1: // changed
                logger.debug("update() '" + shortUrl + "' is changed");
                setStatus(DocSearch.dsReIdxgLnk + " " + shortUrl);
                ir.deleteDocument(curIdxNum);
                ir.close();
                iw = new IndexWriter(dsi.getIndexPath(), new StandardAnalyzer(), false);
                // iw.setUseCompoundFile(true);
                int curAddedSuccess = ds.idx.addDocToIndex(downloadFile, iw, dsi, false, curSpider);
                iw.close();
                ir = IndexReader.open(dsi.getIndexPath());
                if (curAddedSuccess == 0) {
                    numChanges++;
                    totalSpidered++;
                } else if (curAddedSuccess == 2) {
                    numMetaNoIdx++;
                } else if (curAddedSuccess == 1) {
                    logger.warn("update() indexing failed " + shortUrl);
                    numFails++;
                }

                // get links from downloaded file
                if (isHtml(curSpider.getUrl())) {
                    checkFileForLinks(downloadFile, curSpider.getUrl());
                }
                break;
            }
        }
        // document is not in index
        else {
            logger.debug("update() '" + shortUrl + "' is not in index");
            setStatus(DocSearch.dsSpiderNewUrl + " " + shortUrl + " (" + totalSpidered + " / "
                    + curNumLinksFound + ")");

            boolean downloadOk = netUtils.downloadURLToFile(curSpider, downloadFile);
            if (downloadOk) {
                iw = new IndexWriter(dsi.getIndexPath(), new StandardAnalyzer(), false);
                // iw.setUseCompoundFile(true);
                int curAddedSuccess = ds.idx.addDocToIndex(downloadFile, iw, dsi, false, curSpider);
                iw.close();
                ir.close();
                ir = IndexReader.open(dsi.getIndexPath());
                if (curAddedSuccess == 0) {
                    numNew++;
                    totalSpidered++;
                } else if (curAddedSuccess == 2) {
                    numMetaNoIdx++;
                } else if (curAddedSuccess == 1) {
                    logger.warn("update() indexing failed " + shortUrl);
                    numFails++;
                }
                if (isHtml(curSpider.getUrl())) {
                    checkFileForLinks(downloadFile, curSpider.getUrl());
                }
            } else {
                setStatus(DocSearch.dsBknLink + " " + shortUrl);
                curSpider.setIsDeadLink(true);
            }
        }

        // last things to do
        curSpider.setSpidered(true);
        curSpiderNum = getNextUrlNo();
        if (curSpiderNum == -1) {
            break;
        }
        if (totalSpidered > maxTotal) {
            break;
        }

        // delete temp file
        if (!FileUtils.deleteFile(downloadFile)) {
            logger.warn("update() can't delete file '" + downloadFile + "'");
        }
    }

    setStatus(DocSearch.dsSpdrUpdteComp + " " + dsi.getName());
    saveAllLinks();

    // update the date of the index
    dsi.setLastIndexed(DateTimeUtils.getToday());
    ir.close();
    ds.resetProgress();
}

From source file:org.jab.docsearch.utils.MetaReport.java

License:Open Source License

/**
 * doMetaDataReport
 *
 * @param di
 * @param listAll
 * @param pathRequired
 * @param pathText
 * @param authRequired
 * @param authText
 * @param reportFile
 * @param maxDocs
 * @param useDaysOld
 * @param maxDays
 */
private void doMetaDataReport(DocSearcherIndex di, boolean listAll, boolean pathRequired, String pathText,
        boolean authRequired, String authText, String reportFile, int maxDocs, boolean useDaysOld,
        int maxDays) {
    try {
        // initialize our metrics
        int numBadDocs = 0;
        int totalDocs = 0;
        int numGoodDocs = 0;
        String lineSep = Utils.LINE_SEPARATOR;
        StringBuffer documentBuffer = new StringBuffer();
        StringBuffer metaDataReport = new StringBuffer();

        // initialize the reader
        IndexReader ir = IndexReader.open(di.getIndexPath());
        int numDocs = ir.maxDoc();
        ds.setStatus(numDocs + " " + Messages.getString("DocSearch.numDox") + " " + di.getName());

        // write the start of the table
        documentBuffer.append("<table style=\"empty-cells:show\" border=\"1\">").append(lineSep);
        documentBuffer.append("<tr>").append(lineSep);
        int numHdrs = allFields.length;
        for (int z = 0; z < numHdrs; z++) {
            documentBuffer.append("<th valign=\"top\">");
            documentBuffer.append(allFields[z]);
            documentBuffer.append("</th>").append(lineSep);
        }
        documentBuffer.append("</tr>").append(lineSep);
        for (int i = 0; i < numDocs; i++) {
            if (!ir.isDeleted(i)) {
                Document doc = ir.document(i);
                if (doc != null) {
                    boolean curSkip = false;

                    // put in the docs values
                    String path;
                    if (di.getIsWeb()) {
                        path = doc.get(Index.FIELD_URL);
                    } else {
                        path = doc.get(Index.FIELD_PATH);
                    }

                    ds.setStatus("Examining document: " + path);
                    String type = doc.get(Index.FIELD_TYPE);
                    String author = doc.get(Index.FIELD_AUTHOR);
                    String summary = doc.get(Index.FIELD_SUMMARY);
                    String title = doc.get(Index.FIELD_TITLE);
                    String size = doc.get(Index.FIELD_SIZE);
                    String keywords = doc.get(Index.FIELD_KEYWORDS);
                    String date = DateTimeUtils.getDateParsedFromIndex(doc.get(Index.FIELD_MODDATE));

                    // determine if we even need to examine it
                    if (pathRequired) {
                        if (path.indexOf(pathText) == -1) {
                            curSkip = true;
                        }
                    }

                    if (authRequired) {
                        if (author.indexOf(authText) == -1) {
                            curSkip = true;
                        }
                    }

                    // determine if it's bad or good
                    if (!curSkip) {
                        totalDocs++;
                        boolean isGood = goodMetaData(title, summary, author, date, keywords, type, useDaysOld,
                                maxDays);

                        // write to our file
                        if (!isGood || listAll) {
                            documentBuffer.append("<tr>").append(lineSep);
                            documentBuffer.append("<td valign=\"top\">"); // path
                            documentBuffer.append(path);
                            documentBuffer.append("</td>").append(lineSep);
                            documentBuffer.append("<td valign=\"top\"><small>");
                            documentBuffer.append(Utils.convertTextToHTML(title));
                            documentBuffer.append("</small></td>").append(lineSep);
                            documentBuffer.append("<td valign=\"top\">");
                            documentBuffer.append(author);
                            documentBuffer.append("</td>").append(lineSep);
                            documentBuffer.append("<td valign=\"top\">");
                            documentBuffer.append(date);
                            documentBuffer.append("</td>").append(lineSep);
                            documentBuffer.append("<td valign=\"top\"><small>");
                            documentBuffer.append(Utils.convertTextToHTML(summary));
                            documentBuffer.append("</small></td>").append(lineSep);
                            documentBuffer.append("<td valign=\"top\"><small>");
                            documentBuffer.append(keywords);
                            documentBuffer.append("</small></td>").append(lineSep);
                            documentBuffer.append("<td valign=\"top\">");
                            documentBuffer.append(size);
                            documentBuffer.append("</td>").append(lineSep);
                            documentBuffer.append("<td valign=\"top\">");
                            documentBuffer.append(type);
                            documentBuffer.append("</td>").append(lineSep);
                            documentBuffer.append("</tr>").append(lineSep);
                        }

                        if (isGood) {
                            ds.setStatus(path + " " + dsNotMsgMeta);
                            numGoodDocs++;
                        } else {
                            ds.setStatus(path + " " + dsMsgMeta);
                            numBadDocs++;
                        }
                    } else {
                        ds.setStatus(dsSkip + " " + path);
                    }
                }
            }

            if (i > maxDocs) {
                break;
            }
        }
        documentBuffer.append("</table>").append(lineSep);

        int percentGood = 0;
        if (totalDocs > 0) {
            percentGood = (numGoodDocs * 100) / totalDocs;
        }

        ds.setStatus("%  " + dsGood + ": " + percentGood + " (" + numGoodDocs + " / " + totalDocs + ", "
                + numBadDocs + " " + dsBad + ").");

        // write complete report with summary
        metaDataReport.append("<html>").append(lineSep);
        metaDataReport.append("<head>").append(lineSep);
        metaDataReport.append("<title>").append(dsMetaRpt).append(' ').append(di.getName()).append("</title>")
                .append(lineSep);
        metaDataReport.append(
                "<meta name=\"description\" content=\"lists documents with poorly searchable meta data\">")
                .append(lineSep);
        metaDataReport.append("<meta name=\"author\" content=\"DocSearcher\">").append(lineSep);
        metaDataReport.append("</head>").append(lineSep);
        metaDataReport.append("<body>").append(lineSep);
        metaDataReport.append("<h1>").append(dsMetaRpt).append(' ').append(di.getName()).append("</h1>")
                .append(lineSep);
        metaDataReport.append("<p align=\"left\"><b>");
        metaDataReport.append(numBadDocs);
        metaDataReport.append("</b> ");
        metaDataReport.append(dsPoorMeta);
        metaDataReport.append(" <br> &amp; <b>");
        metaDataReport.append(numGoodDocs);
        metaDataReport.append("</b> ");
        metaDataReport.append(dsGoodMetaNum);
        metaDataReport.append(".</p>").append(lineSep);
        metaDataReport.append("<p align=\"left\">");
        metaDataReport.append(dsMetaOO);
        metaDataReport.append(" <b>");
        metaDataReport.append(percentGood + "</b> % . </p>");
        metaDataReport.append("<p align=\"left\">");
        metaDataReport.append(dsTblDsc);
        metaDataReport.append(".</p>").append(lineSep);

        // add document buffer
        metaDataReport.append(documentBuffer);

        metaDataReport.append("</body>").append(lineSep);
        metaDataReport.append("</html>").append(lineSep);

        ds.curPage = Messages.getString("DocSearch.report");

        boolean fileSaved = FileUtils.saveFile(reportFile, metaDataReport);
        if (fileSaved) {
            ds.doExternal(reportFile);
        }
    } catch (IOException ioe) {
        logger.fatal("doMetaDataReport() create meta data report failed", ioe);
        ds.setStatus(Messages.getString("DocSearch.statusMetaDataError") + di.getName() + ":" + ioe.toString());
    }
}

From source file:org.lexevs.dao.index.lucenesupport.BaseLuceneIndexTemplate.java

License:Open Source License

public int getMaxDoc() {
    return this.doInIndexReader(new IndexReaderCallback<Integer>() {

        @Override
        public Integer doInIndexReader(IndexReader indexReader) throws Exception {
            return indexReader.maxDoc();
        }
    });
}
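
The template hides reader lifecycle management, so a caller gets the count without ever touching the reader. A hypothetical call site (the template variable is assumed to be injected):

    // hypothetical caller; no reader handling is needed here
    int docIdUpperBound = luceneIndexTemplate.getMaxDoc(); // every doc ID in the index is < this value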

From source file:org.metaservice.core.maven.MavenIndexCrawler.java

License:Apache License

public void perform() throws IOException, ComponentLookupException, InvalidVersionSpecificationException {
    // Files where local cache is (if any) and Lucene Index should be located
    File centralLocalCache = new File("target/central-cache");
    File centralIndexDir = new File("target/central-index");

    // Creators we want to use (search for fields it defines)
    List<IndexCreator> indexers = new ArrayList<>();
    indexers.add(plexusContainer.lookup(IndexCreator.class, "min"));
    indexers.add(plexusContainer.lookup(IndexCreator.class, "jarContent"));
    indexers.add(plexusContainer.lookup(IndexCreator.class, "maven-plugin"));

    // Create context for central repository index
    centralContext = indexer.createIndexingContext("central-context", "central", centralLocalCache,
            centralIndexDir, "http://repo1.maven.org/maven2", null, true, true, indexers);

    // Update the index (incremental update will happen if this is not 1st run and files are not deleted)
    // This whole block below should not be executed on every app start, but rather controlled by some configuration
    // since this block will always emit at least one HTTP GET. Central indexes are updated once a week, but
    // other index sources might have different index publishing frequency.
    // Preferred frequency is once a week.
    if (true) {
        System.out.println("Updating Index...");
        System.out.println("This might take a while on first run, so please be patient!");
        // Create ResourceFetcher implementation to be used with IndexUpdateRequest
        // Here, we use Wagon based one as shorthand, but all we need is a ResourceFetcher implementation
        TransferListener listener = new AbstractTransferListener() {
            public void transferStarted(TransferEvent transferEvent) {
                System.out.print(" Downloading " + transferEvent.getResource().getName());
            }

            public void transferProgress(TransferEvent transferEvent, byte[] buffer, int length) {
            }

            public void transferCompleted(TransferEvent transferEvent) {
                System.out.println(" - Done");
            }
        };
        ResourceFetcher resourceFetcher = new WagonHelper.WagonFetcher(httpWagon, listener, null, null);

        Date centralContextCurrentTimestamp = centralContext.getTimestamp();
        IndexUpdateRequest updateRequest = new IndexUpdateRequest(centralContext, resourceFetcher);
        IndexUpdateResult updateResult = indexUpdater.fetchAndUpdateIndex(updateRequest);
        if (updateResult.isFullUpdate()) {
            System.out.println("Full update happened!");
        } else if (updateResult.getTimestamp().equals(centralContextCurrentTimestamp)) {
            System.out.println("No update needed, index is up to date!");
        } else {
            System.out.println("Incremental update happened, change covered " + centralContextCurrentTimestamp
                    + " - " + updateResult.getTimestamp() + " period.");
        }

        System.out.println();
    }

    System.out.println();
    System.out.println("Using index");
    System.out.println("===========");
    System.out.println();

    // ====
    // Case:
    // dump all the GAVs
    // NOTE: we will not actually dump all of Central below (it is HUGE and would take too long), but the code
    // is here as an example

    int j = 0;
    if (true) {

        final IndexSearcher searcher = centralContext.acquireIndexSearcher();
        try {
            final IndexReader ir = searcher.getIndexReader();
            for (int i = 0; i < ir.maxDoc(); i++) {
                if (!ir.isDeleted(i)) {
                    j++;
                    final Document doc = ir.document(i);
                    final ArtifactInfo ai = IndexUtils.constructArtifactInfo(doc, centralContext);
                    if (ai != null && "pom".equals(ai.fextension))
                        System.out.println(ai.groupId + ":" + ai.artifactId + ":" + ai.version + ":"
                                + ai.classifier + " (sha1=" + ai.sha1 + ")");

                }
            }
        } finally {
            centralContext.releaseIndexSearcher(searcher);
        }
    }
    System.err.println(j);
    if (j > 0)
        return;

    // ====
    // Case:
    // Search for all GAVs with known G and A and having version greater than V

    final GenericVersionScheme versionScheme = new GenericVersionScheme();
    final String versionString = "1.5.0";
    final Version version = versionScheme.parseVersion(versionString);

    // construct the query for known GA
    final Query groupIdQ = indexer.constructQuery(MAVEN.GROUP_ID,
            new SourcedSearchExpression("org.sonatype.nexus"));
    final Query artifactIdQ = indexer.constructQuery(MAVEN.ARTIFACT_ID,
            new SourcedSearchExpression("nexus-api"));
    final BooleanQuery query = new BooleanQuery();
    query.add(groupIdQ, BooleanClause.Occur.MUST);
    query.add(artifactIdQ, BooleanClause.Occur.MUST);

    // we want "jar" artifacts only
    query.add(indexer.constructQuery(MAVEN.PACKAGING, new SourcedSearchExpression("jar")),
            BooleanClause.Occur.MUST);
    // we want main artifacts only (no classifier)
    // Note: this below is unfinished API, needs fixing
    query.add(indexer.constructQuery(MAVEN.CLASSIFIER, new SourcedSearchExpression(Field.NOT_PRESENT)),
            BooleanClause.Occur.MUST_NOT);

    // construct the filter to express "V greater than"
    final ArtifactInfoFilter versionFilter = new ArtifactInfoFilter() {
        public boolean accepts(final IndexingContext ctx, final ArtifactInfo ai) {
            try {
                final Version aiV = versionScheme.parseVersion(ai.version);
                // Use ">=" if you are INCLUSIVE
                return aiV.compareTo(version) > 0;
            } catch (InvalidVersionSpecificationException e) {
                // do something here? be safe and include?
                return true;
            }
        }
    };

    System.out.println(
            "Searching for all GAVs with G=org.sonatype.nexus and nexus-api and having V greater than 1.5.0");
    final IteratorSearchRequest request = new IteratorSearchRequest(query,
            Collections.singletonList(centralContext), versionFilter);
    final IteratorSearchResponse response = indexer.searchIterator(request);
    for (ArtifactInfo ai : response) {
        System.out.println(ai.toString());
    }

    // Case:
    // Use index
    // Searching for some artifact
    Query gidQ = indexer.constructQuery(MAVEN.GROUP_ID,
            new SourcedSearchExpression("org.apache.maven.indexer"));
    Query aidQ = indexer.constructQuery(MAVEN.ARTIFACT_ID, new SourcedSearchExpression("indexer-artifact"));

    BooleanQuery bq = new BooleanQuery();
    bq.add(gidQ, BooleanClause.Occur.MUST);
    bq.add(aidQ, BooleanClause.Occur.MUST);

    searchAndDump(indexer, "all artifacts under GA org.apache.maven.indexer:indexer-artifact", bq);

    // Searching for some main artifact
    bq = new BooleanQuery();
    bq.add(gidQ, BooleanClause.Occur.MUST);
    bq.add(aidQ, BooleanClause.Occur.MUST);
    // bq.add( nexusIndexer.constructQuery( MAVEN.CLASSIFIER, new SourcedSearchExpression( "*" ) ), Occur.MUST_NOT
    // );

    searchAndDump(indexer, "main artifacts under GA org.apache.maven.indexer:indexer-artifact", bq);

    // doing sha1 search
    searchAndDump(indexer, "SHA1 7ab67e6b20e5332a7fb4fdf2f019aec4275846c2", indexer.constructQuery(MAVEN.SHA1,
            new SourcedSearchExpression("7ab67e6b20e5332a7fb4fdf2f019aec4275846c2")));

    searchAndDump(indexer, "SHA1 7ab67e6b20 (partial hash)",
            indexer.constructQuery(MAVEN.SHA1, new UserInputSearchExpression("7ab67e6b20")));

    // doing classname search (incomplete classname)
    searchAndDump(indexer,
            "classname DefaultNexusIndexer (note: Central does not publish classes in the index)",
            indexer.constructQuery(MAVEN.CLASSNAMES, new UserInputSearchExpression("DefaultNexusIndexer")));

    // doing search for all "canonical" maven plugins latest versions
    bq = new BooleanQuery();
    bq.add(indexer.constructQuery(MAVEN.PACKAGING, new SourcedSearchExpression("maven-plugin")),
            BooleanClause.Occur.MUST);
    bq.add(indexer.constructQuery(MAVEN.GROUP_ID, new SourcedSearchExpression("org.apache.maven.plugins")),
            BooleanClause.Occur.MUST);
    searchGroupedAndDump(indexer, "all \"canonical\" maven plugins", bq, new GAGrouping());

    // close cleanly
    indexer.closeIndexingContext(centralContext, false);
}