List of usage examples for org.apache.lucene.index.IndexReader.numDocs()
public abstract int numDocs();
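Returns the number of documents in the index. Note that numDocs() counts only live (non-deleted) documents, while maxDoc() also counts deleted documents that have not yet been merged away. A minimal sketch of the distinction (assuming Lucene 5.x, where FSDirectory.open takes a java.nio.file.Path; the index path is hypothetical):

import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.store.FSDirectory;

public class NumDocsDemo {
    public static void main(String[] args) throws Exception {
        // Open a reader on an existing index directory (path is hypothetical)
        try (DirectoryReader reader = DirectoryReader.open(
                FSDirectory.open(Paths.get("/tmp/my-index")))) {
            System.out.println("live docs:    " + reader.numDocs());        // excludes deletions
            System.out.println("max doc:      " + reader.maxDoc());         // includes deleted slots
            System.out.println("deleted docs: " + reader.numDeletedDocs()); // maxDoc - numDocs
        }
    }
}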
From source file: de.linguatools.disco.DISCO.java
License: Apache License
/***************************************************************************
 * Run through all documents (i.e. queryable words) in the index and retrieve
 * each word together with its frequency. Write both to the file named
 * outputFileName. This method can be used to check index integrity.<br/>
 * @param outputFileName
 * @return number of words written to the output file. On success the value
 * equals the number of words in the index.
 */
public int wordFrequencyList(String outputFileName) {
    // create an IndexReader for the index directory
    IndexReader ir = null;
    try {
        if (indexRAM != null) {
            ir = IndexReader.open(indexRAM);
        } else {
            ir = IndexReader.open(FSDirectory.open(new File(indexName)));
        }
    } catch (CorruptIndexException ex) {
        System.out.println(DISCO.class.getName() + ": " + ex);
        return -1;
    } catch (IOException ex) {
        System.out.println(DISCO.class.getName() + ": " + ex);
        return -1;
    }
    // get the number of documents in the index
    int N = ir.numDocs();
    // open the output file
    FileWriter fw;
    try {
        fw = new FileWriter(outputFileName);
    } catch (IOException ex) {
        System.out.println(DISCO.class.getName() + ": " + ex);
        return -1;
    }
    // iterate over all documents
    int corrupt = 0;
    int ioerror = 0;
    int i = 0;
    for (i = 0; i < N; i++) {
        Document doc = null;
        try {
            doc = ir.document(i);
        } catch (CorruptIndexException ex) {
            corrupt++;
            continue;
        } catch (IOException ex) {
            ioerror++;
            continue;
        }
        // fetch word no. i
        String word = doc.get("word");
        // fetch the frequency of word i
        int f = Integer.parseInt(doc.get("freq"));
        try {
            // write word and frequency to the output file
            fw.write(word + "\t" + f + "\n");
        } catch (IOException ex) {
            System.out.println(DISCO.class.getName() + ": word " + i + ": " + ex);
            return i;
        }
        // print progress
        if (i % 100 == 0) {
            System.out.print("\r" + i);
        }
    }
    System.out.println();
    if (corrupt > 0 || ioerror > 0) {
        int e = corrupt + ioerror;
        System.out.println("*** WARNING! ***");
        System.out.println("The language data packet \"" + indexName + "\" "
                + "has " + e + " defect entries (" + corrupt + " corrupt, "
                + ioerror + " IO errors)");
        System.out.println("All functioning words have been written to " + outputFileName);
    }
    // clean up
    try {
        fw.close();
        ir.close();
    } catch (IOException ex) {
        System.out.println(DISCO.class.getName() + ": " + ex);
        return -1;
    }
    return (i - corrupt - ioerror);
}
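The loop above assumes that document IDs 0..numDocs()-1 are all valid, which holds only for an index without deletions; once documents have been deleted, IDs run up to maxDoc() and some slots are no longer live. A deletion-safe variant, sketched against the same Lucene 3.x API as the example (indexName as above):

IndexReader ir = IndexReader.open(FSDirectory.open(new File(indexName)));
try {
    // iterate up to maxDoc() and skip deleted slots instead of
    // assuming IDs 0..numDocs()-1 are all live
    for (int i = 0; i < ir.maxDoc(); i++) {
        if (ir.isDeleted(i)) {
            continue; // deleted document slot
        }
        Document doc = ir.document(i);
        // ... process doc ...
    }
} finally {
    ir.close();
}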
From source file: de.schlund.pfixcore.lucefix.PfixReadjustment.java
License: Open Source License
/**
 * Checks the list of include parts for changes and updates the search index.
 */
public void readjust() {
    Collection<Tripel> partsKnownByPustefix = getUsedTripels();
    IndexReader reader = null;
    PfixQueueManager queue;
    boolean jobDone;
    long startLoop, stopLoop, startCollect, stopCollect, startIndexLoop, stopIndexLoop, startAddLoop, stopAddLoop;
    long collectTime = 0;
    int knownDocsSize, newDocs, deleteDocs, numDocs;
    startLoop = stopLoop = startCollect = stopCollect = startIndexLoop = stopIndexLoop = startAddLoop = stopAddLoop = 0;
    newDocs = knownDocsSize = deleteDocs = numDocs = 0;
    startLoop = System.currentTimeMillis();

    Set<Tripel> tripelsToIndex = new TreeSet<Tripel>();
    queue = PfixQueueManager.getInstance(null);
    try {
        jobDone = false;
        startCollect = System.currentTimeMillis();
        partsKnownByPustefix = getUsedTripels();
        stopCollect = System.currentTimeMillis();
        collectTime = stopCollect - startCollect;
        knownDocsSize = partsKnownByPustefix.size();
        try {
            reader = IndexReader.open(LUCENE_DATA);
        } catch (IOException ioe) {
            LOG.warn("broken or nonexistent database -> will queue ALL known parts");
            for (Iterator<Tripel> iter = partsKnownByPustefix.iterator(); iter.hasNext();) {
                Tripel element = iter.next();
                element.setType(Tripel.Type.INSERT);
                newDocs++;
                if (!tripelsToIndex.add(element)) {
                    LOG.debug("duplicated insert");
                }
            }
            jobDone = true;
        }
        if (!jobDone) {
            numDocs = reader.numDocs();
            startIndexLoop = System.currentTimeMillis();
            docloop: for (int i = 0; i < numDocs; i++) {
                Document currentdoc;
                try {
                    currentdoc = reader.document(i);
                } catch (RuntimeException e) {
                    // this happens if we want to access a deleted document -> continue
                    continue docloop;
                }
                // check if still needed
                String path = currentdoc.get(PreDoc.PATH);
                Tripel pfixTripel = new Tripel(path, null);
                if (partsKnownByPustefix.contains(pfixTripel)) {
                    // check timestamps
                    File f = new File(GlobalConfig.getDocroot(), currentdoc.get(PreDoc.FILENAME));
                    if (f.lastModified() != DateField.stringToTime(currentdoc.get(PreDoc.LASTTOUCH))) {
                        // timestamp differs
                        pfixTripel.setType(Tripel.Type.INSERT);
                        LOG.debug("TS differs: " + pfixTripel);
                        newDocs++;
                        if (!tripelsToIndex.add(pfixTripel)) {
                            LOG.debug("duplicated insert " + pfixTripel);
                        }
                    }
                    partsKnownByPustefix.remove(pfixTripel);
                } else {
                    // part not needed anymore
                    Tripel newTripel = new Tripel(currentdoc.get(PreDoc.PATH), Tripel.Type.DELETE);
                    deleteDocs++;
                    queue.queue(newTripel);
                }
            }
            stopIndexLoop = System.currentTimeMillis();
            // now partsKnownByPustefix only contains parts which are NOT indexed...
            startAddLoop = System.currentTimeMillis();
            for (Iterator<Tripel> iter = partsKnownByPustefix.iterator(); iter.hasNext();) {
                Tripel element = iter.next();
                element.setType(Tripel.Type.INSERT);
                // LOG.debug("adding " + element + " to queue (INDEX)");
                newDocs++;
                if (!tripelsToIndex.add(element)) {
                    LOG.debug("duplicated insert " + element);
                }
                // queue.queue(element);
            }
            stopAddLoop = System.currentTimeMillis();
        }
    } catch (IOException ioe) {
        LOG.error("error reading index", ioe);
    }

    // it's a TreeSet, so it is already sorted
    // Collections.sort(tripelsToIndex);
    for (Tripel tripel : tripelsToIndex) {
        queue.queue(tripel);
    }
    stopLoop = System.currentTimeMillis();

    long needed = stopLoop - startLoop;
    if (newDocs != 0 || deleteDocs != 0) {
        LOG.debug(needed + "ms (getUsedTripels(): " + collectTime + "ms (" + knownDocsSize
                + "u) indexloop: " + (stopIndexLoop - startIndexLoop) + "|"
                + (stopAddLoop - startAddLoop) + "ms (" + numDocs + "u), added "
                + newDocs + "+" + deleteDocs + " queueitems");
    }
    try {
        if (reader != null) {
            reader.close();
            reader = null;
        }
    } catch (IOException e) {
        LOG.error("error while closing reader", e);
    }
}
From source file: de.tudarmstadt.ukp.dkpro.core.decompounding.web1t.LuceneIndexerTest.java
License: Apache License
@Test
public void testSearch() throws Exception {
    // Check that the fields and all documents exist
    IndexReader ir0 = IndexReader.open(FSDirectory.open(targetIndex0));
    IndexReader ir1 = IndexReader.open(FSDirectory.open(targetIndex1));
    Assert.assertEquals("Number of documents", 3, ir0.numDocs() + ir1.numDocs());

    Document doc = ir0.document(0);
    Assert.assertNotNull("Field: gram", doc.getField("gram"));
    Assert.assertNotNull("Field: freq", doc.getField("freq"));

    ir0.close();
    ir1.close();

    // Search on the index
    Finder f = new Finder(index, jWeb1T);
    Assert.assertEquals(f.find("relax").size(), 3);
    Assert.assertEquals(f.find("couch").size(), 1);
    Assert.assertEquals(f.find("relax couch").size(), 1);
    Assert.assertEquals(f.find("couchdb").size(), 1);
}
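Instead of summing the counts of two readers by hand, the two indexes could also be wrapped in a MultiReader, whose numDocs() is the sum of the live documents of its sub-readers. A sketch under the same Lucene 3.x API and test fixtures (targetIndex0, targetIndex1) as above:

IndexReader ir0 = IndexReader.open(FSDirectory.open(targetIndex0));
IndexReader ir1 = IndexReader.open(FSDirectory.open(targetIndex1));
// present both indexes as one reader
IndexReader combined = new MultiReader(ir0, ir1);
try {
    Assert.assertEquals("Number of documents", 3, combined.numDocs());
} finally {
    combined.close(); // this constructor also closes the sub-readers
}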
From source file: de.tudarmstadt.ukp.teaching.uima.nounDecompounding.web1t.LuceneIndexerTest.java
License: Open Source License
@Test
public void testSearch() throws Exception {
    // Check that the fields and all documents exist
    IndexReader ir = IndexReader.open(FSDirectory.open(targetIndex));
    Assert.assertEquals("Number of documents", 2, ir.numDocs());

    Document doc = ir.document(0);
    Assert.assertNotNull("Field: gram", doc.getField("gram"));
    Assert.assertNotNull("Field: freq", doc.getField("freq"));

    ir.close();

    // Search on the index
    IndexSearcher searcher = new IndexSearcher(FSDirectory.open(targetIndex));
    QueryParser p = new QueryParser(Version.LUCENE_30, "token", new StandardAnalyzer(Version.LUCENE_30));

    Query q = p.parse("gram:relax");
    Assert.assertEquals("Hit count 'relax'", 2, searcher.search(q, 100).totalHits);

    q = p.parse("gram:couch");
    Assert.assertEquals("Hit count 'couch'", 1, searcher.search(q, 100).totalHits);

    q = p.parse("gram:relax AND gram:couch");
    Assert.assertEquals("Hit count 'relax AND couch'", 1, searcher.search(q, 100).totalHits);

    q = p.parse("gram:couchdb");
    Assert.assertEquals("Hit count 'couchdb'", 1, searcher.search(q, 100).totalHits);

    searcher.close();
}
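A quick consistency check between reader and searcher: a MatchAllDocsQuery must hit exactly numDocs() documents, since every live document matches it. A sketch against the same Lucene 3.x fixtures as the test above:

IndexReader ir = IndexReader.open(FSDirectory.open(targetIndex));
IndexSearcher searcher = new IndexSearcher(ir);
// every live document matches, so totalHits == numDocs()
Assert.assertEquals(ir.numDocs(), searcher.search(new MatchAllDocsQuery(), 100).totalHits);
searcher.close();
ir.close();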
From source file: de.unihildesheim.iw.cli.DumpIPCs.java
License: Open Source License
private void runMain(final String... args) throws IOException, BuildException {
    new CmdLineParser(this.cliParams);
    parseWithHelp(this.cliParams, args);

    // check if files and directories are sane
    this.cliParams.check();
    assert this.cliParams.idxReader != null;

    final int maxDoc = this.cliParams.idxReader.maxDoc();
    if (maxDoc == 0) {
        LOG.error("Empty index.");
        return;
    }

    final Parser ipcParser = new Parser();
    ipcParser.separatorChar(this.cliParams.sep);
    ipcParser.allowZeroPad(this.cliParams.zeroPad);

    final DirectoryReader reader = DirectoryReader.open(FSDirectory.open(this.cliParams.idxDir.toPath()));
    final Builder idxReaderBuilder = new Builder(reader);

    Pattern rx_ipc = null;
    if (this.cliParams.ipc != null) {
        final IPCRecord ipc = ipcParser.parse(this.cliParams.ipc);
        final BooleanQuery bq = new BooleanQuery();
        rx_ipc = Pattern.compile(ipc.toRegExpString(this.cliParams.sep));
        if (LOG.isDebugEnabled()) {
            LOG.debug("IPC regExp: rx={} pat={}", ipc.toRegExpString(this.cliParams.sep), rx_ipc);
        }
        bq.add(new QueryWrapperFilter(IPCClassQuery.get(ipc, this.cliParams.sep)), Occur.MUST);
        bq.add(new QueryWrapperFilter(
                new IPCFieldFilter(new IPCFieldFilterFunctions.SloppyMatch(ipc), ipcParser)), Occur.MUST);
        idxReaderBuilder.queryFilter(new QueryWrapperFilter(bq));
    }

    final IndexReader idxReader = idxReaderBuilder.build();
    if (idxReader.numDocs() > 0) {
        final Terms terms = MultiFields.getTerms(idxReader, LUCENE_CONF.FLD_IPC);
        TermsEnum termsEnum = TermsEnum.EMPTY;
        BytesRef term;
        if (terms != null) {
            termsEnum = terms.iterator(termsEnum);
            term = termsEnum.next();
            final int[] count = { 0, 0 }; // match, exclude
            while (term != null) {
                final String code = term.utf8ToString();
                if (rx_ipc == null || rx_ipc.matcher(code).matches()) {
                    final IPCRecord record = ipcParser.parse(code);
                    try {
                        System.out.println(code + ' ' + record + " (" + record.toFormattedString() + ") "
                                + '[' + record.toRegExpString('-') + ']');
                    } catch (final IllegalArgumentException e) {
                        System.out.println(code + ' ' + "INVALID (" + code + ')');
                    }
                    count[0]++;
                } else {
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Skip non matching IPC: {}", code);
                    }
                    count[1]++;
                }
                term = termsEnum.next();
            }
            LOG.info("match={} skip={}", count[0], count[1]);
        }
    } else {
        LOG.info("No documents left after filtering.");
    }
}
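The numDocs() > 0 guard above matters because the reader has been wrapped with a query filter, and MultiFields.getTerms returns null for a field that holds no terms. Stripped of the IPC-specific filtering, the core enumeration pattern looks roughly like this (Lucene 4.x-style MultiFields API as in the example; the field name is an assumption):

if (idxReader.numDocs() > 0) {
    Terms terms = MultiFields.getTerms(idxReader, "ipc"); // field name assumed
    if (terms != null) {
        TermsEnum te = terms.iterator(null); // no TermsEnum to reuse
        for (BytesRef t = te.next(); t != null; t = te.next()) {
            System.out.println(t.utf8ToString());
        }
    }
}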
From source file: dk.defxws.fedoragsearch.server.Config.java
License: Open Source License
private void checkConfig() throws ConfigException {
    if (logger.isDebugEnabled())
        logger.debug("fedoragsearch.properties=" + fgsProps.toString());

    // Check for unknown properties, indicating typos or wrong property names
    String[] propNames = { "fedoragsearch.deployFile", "fedoragsearch.soapBase", "fedoragsearch.soapUser",
            "fedoragsearch.soapPass", "fedoragsearch.defaultNoXslt",
            "fedoragsearch.defaultGfindObjectsRestXslt", "fedoragsearch.defaultUpdateIndexRestXslt",
            "fedoragsearch.defaultBrowseIndexRestXslt", "fedoragsearch.defaultGetRepositoryInfoRestXslt",
            "fedoragsearch.defaultGetIndexInfoRestXslt", "fedoragsearch.mimeTypes",
            "fedoragsearch.maxPageSize", "fedoragsearch.defaultBrowseIndexTermPageSize",
            "fedoragsearch.defaultGfindObjectsHitPageSize", "fedoragsearch.defaultGfindObjectsSnippetsMax",
            "fedoragsearch.defaultGfindObjectsFieldMaxLength", "fedoragsearch.repositoryNames",
            "fedoragsearch.indexNames", "fedoragsearch.updaterNames",
            "fedoragsearch.searchResultFilteringModule", "fedoragsearch.searchResultFilteringType" };
    //checkPropNames("fedoragsearch.properties", fgsProps, propNames);

    // Check rest stylesheets
    checkRestStylesheet("fedoragsearch.defaultNoXslt");
    checkRestStylesheet("fedoragsearch.defaultGfindObjectsRestXslt");
    checkRestStylesheet("fedoragsearch.defaultUpdateIndexRestXslt");
    checkRestStylesheet("fedoragsearch.defaultBrowseIndexRestXslt");
    checkRestStylesheet("fedoragsearch.defaultGetRepositoryInfoRestXslt");
    checkRestStylesheet("fedoragsearch.defaultGetIndexInfoRestXslt");

    // Check mimeTypes
    checkMimeTypes("fedoragsearch", fgsProps, "fedoragsearch.mimeTypes");

    // Check resultPage properties
    try {
        maxPageSize = Integer.parseInt(fgsProps.getProperty("fedoragsearch.maxPageSize"));
    } catch (NumberFormatException e) {
        errors.append("\n*** maxPageSize is not valid:\n" + e.toString());
    }
    try {
        defaultBrowseIndexTermPageSize = Integer
                .parseInt(fgsProps.getProperty("fedoragsearch.defaultBrowseIndexTermPageSize"));
    } catch (NumberFormatException e) {
        errors.append("\n*** defaultBrowseIndexTermPageSize is not valid:\n" + e.toString());
    }
    try {
        defaultGfindObjectsHitPageSize = Integer
                .parseInt(fgsProps.getProperty("fedoragsearch.defaultGfindObjectsHitPageSize"));
    } catch (NumberFormatException e) {
        errors.append("\n*** defaultGfindObjectsHitPageSize is not valid:\n" + e.toString());
    }
    try {
        defaultGfindObjectsSnippetsMax = Integer
                .parseInt(fgsProps.getProperty("fedoragsearch.defaultGfindObjectsSnippetsMax"));
    } catch (NumberFormatException e) {
        errors.append("\n*** defaultGfindObjectsSnippetsMax is not valid:\n" + e.toString());
    }
    try {
        defaultGfindObjectsFieldMaxLength = Integer
                .parseInt(fgsProps.getProperty("fedoragsearch.defaultGfindObjectsFieldMaxLength"));
    } catch (NumberFormatException e) {
        errors.append("\n*** defaultGfindObjectsFieldMaxLength is not valid:\n" + e.toString());
    }

    // Check updater properties
    String updaterProperty = fgsProps.getProperty("fedoragsearch.updaterNames");
    if (updaterProperty == null) {
        updaterNameToProps = null; // No updaters will be created
    } else {
        updaterNameToProps = new Hashtable();
        StringTokenizer updaterNames = new StringTokenizer(updaterProperty);
        while (updaterNames.hasMoreTokens()) {
            String updaterName = updaterNames.nextToken();
            try {
                InputStream propStream = null;
                try {
                    propStream = getResourceInputStream("/updater/" + updaterName + "/updater.properties");
                } catch (ConfigException e) {
                    errors.append("\n" + e.getMessage());
                }
                Properties props = new Properties();
                props.load(propStream);
                propStream.close();
                //MIH convertProperties(props);
                if (logger.isInfoEnabled()) {
                    logger.info(configName + "/updater/" + updaterName + "/updater.properties="
                            + props.toString());
                }
                // Check properties
                String propsNamingFactory = props.getProperty("java.naming.factory.initial");
                String propsProviderUrl = props.getProperty("java.naming.provider.url");
                String propsConnFactory = props.getProperty("connection.factory.name");
                String propsClientId = props.getProperty("client.id");
                if (propsNamingFactory == null) {
                    errors.append("\n*** java.naming.factory.initial not provided in " + configName
                            + "/updater/" + updaterName + "/updater.properties");
                }
                if (propsProviderUrl == null) {
                    errors.append("\n*** java.naming.provider.url not provided in " + configName
                            + "/updater/" + updaterName + "/updater.properties");
                }
                if (propsConnFactory == null) {
                    errors.append("\n*** connection.factory.name not provided in " + configName
                            + "/updater/" + updaterName + "/updater.properties");
                }
                if (propsClientId == null) {
                    errors.append("\n*** client.id not provided in " + configName + "/updater/"
                            + updaterName + "/updater.properties");
                }
                updaterNameToProps.put(updaterName, props);
            } catch (IOException e) {
                errors.append("\n*** Error loading " + configName + "/updater/" + updaterName
                        + ".properties:\n" + e.toString());
            }
        }
    }

    // Check searchResultFilteringModule property
    searchResultFilteringModuleProperty = fgsProps.getProperty("fedoragsearch.searchResultFilteringModule");
    if (searchResultFilteringModuleProperty != null && searchResultFilteringModuleProperty.length() > 0) {
        try {
            getSearchResultFiltering();
        } catch (ConfigException e) {
            errors.append(e.getMessage());
        }
        String searchResultFilteringTypeProperty = fgsProps
                .getProperty("fedoragsearch.searchResultFilteringType");
        StringTokenizer srft = new StringTokenizer("");
        if (searchResultFilteringTypeProperty != null) {
            srft = new StringTokenizer(searchResultFilteringTypeProperty);
        }
        int countTokens = srft.countTokens();
        if (searchResultFilteringTypeProperty == null || countTokens == 0 || countTokens > 1) {
            errors.append("\n*** " + configName + ": fedoragsearch.searchResultFilteringType="
                    + searchResultFilteringTypeProperty
                    + ": one and only one of 'presearch', 'insearch', 'postsearch' must be stated.\n");
        } else {
            for (int i = 0; i < countTokens; i++) {
                String token = srft.nextToken();
                if (!("presearch".equals(token) || "insearch".equals(token) || "postsearch".equals(token))) {
                    errors.append("\n*** " + configName + ": fedoragsearch.searchResultFilteringType="
                            + searchResultFilteringTypeProperty
                            + ": only 'presearch', 'insearch', 'postsearch' may be stated, not '"
                            + token + "'.\n");
                }
            }
        }
    }

    // Check repository properties
    Enumeration repositoryNames = repositoryNameToProps.keys();
    while (repositoryNames.hasMoreElements()) {
        String repositoryName = (String) repositoryNames.nextElement();
        Properties props = (Properties) repositoryNameToProps.get(repositoryName);
        if (logger.isDebugEnabled())
            logger.debug(configName + "/repository/" + repositoryName + "/repository.properties="
                    + props.toString());

        // Check for unknown properties, indicating typos or wrong property names
        String[] reposPropNames = { "fgsrepository.repositoryName", "fgsrepository.fedoraSoap",
                "fgsrepository.fedoraUser", "fgsrepository.fedoraPass", "fgsrepository.fedoraObjectDir",
                "fgsrepository.fedoraVersion", "fgsrepository.defaultGetRepositoryInfoResultXslt",
                "fgsrepository.trustStorePath", "fgsrepository.trustStorePass" };
        //checkPropNames(configName+"/repository/"+repositoryName+"/repository.properties", props, reposPropNames);

        // Check repositoryName
        String propsRepositoryName = props.getProperty("fgsrepository.repositoryName");
        if (!repositoryName.equals(propsRepositoryName)) {
            errors.append("\n*** " + configName + "/repository/" + repositoryName
                    + ": fgsrepository.repositoryName must be=" + repositoryName);
        }

        // Check fedoraObjectDir
        //String fedoraObjectDirName = insertSystemProperties(props.getProperty("fgsrepository.fedoraObjectDir"));
        //File fedoraObjectDir = new File(fedoraObjectDirName);
        //if (!fedoraObjectDir.exists()) {
        //    errors.append("\n*** " + configName + "/repository/" + repositoryName
        //            + ": fgsrepository.fedoraObjectDir=" + fedoraObjectDirName + " not found");
        //}

        // Check result stylesheets
        checkResultStylesheet("/repository/" + repositoryName, props,
                "fgsrepository.defaultGetRepositoryInfoResultXslt");
    }

    // Check index properties
    Enumeration indexNames = indexNameToProps.keys();
    while (indexNames.hasMoreElements()) {
        String indexName = (String) indexNames.nextElement();
        Properties props = (Properties) indexNameToProps.get(indexName);
        if (logger.isDebugEnabled())
            logger.debug(configName + "/index/" + indexName + "/index.properties=" + props.toString());

        // Check for unknown properties, indicating typos or wrong property names
        String[] indexPropNames = { "fgsindex.indexName", "fgsindex.indexBase", "fgsindex.indexUser",
                "fgsindex.indexPass", "fgsindex.operationsImpl", "fgsindex.defaultUpdateIndexDocXslt",
                "fgsindex.defaultUpdateIndexResultXslt", "fgsindex.defaultGfindObjectsResultXslt",
                "fgsindex.defaultBrowseIndexResultXslt", "fgsindex.defaultGetIndexInfoResultXslt",
                "fgsindex.indexDir", "fgsindex.analyzer", "fgsindex.untokenizedFields",
                "fgsindex.defaultQueryFields", "fgsindex.snippetBegin", "fgsindex.snippetEnd",
                "fgsindex.maxBufferedDocs", "fgsindex.mergeFactor", "fgsindex.ramBufferSizeMb",
                "fgsindex.defaultWriteLockTimeout", "fgsindex.defaultSortFields", "fgsindex.uriResolver" };
        //checkPropNames(configName+"/index/"+indexName+"/index.properties", props, indexPropNames);

        // Check indexName
        String propsIndexName = props.getProperty("fgsindex.indexName");
        if (!indexName.equals(propsIndexName)) {
            errors.append("\n*** " + configName + "/index/" + indexName
                    + ": fgsindex.indexName must be=" + indexName);
        }

        // Check operationsImpl class
        String operationsImpl = props.getProperty("fgsindex.operationsImpl");
        if (operationsImpl == null || operationsImpl.equals("")) {
            errors.append("\n*** " + configName + "/index/" + indexName
                    + ": fgsindex.operationsImpl must be set in " + configName + "/index/ "
                    + indexName + ".properties");
        }
        try {
            Class operationsImplClass = Class.forName(operationsImpl);
            try {
                GenericOperationsImpl ops = (GenericOperationsImpl) operationsImplClass
                        .getConstructor(new Class[] {}).newInstance(new Object[] {});
            } catch (InstantiationException e) {
                errors.append("\n*** " + configName + "/index/" + indexName + ": fgsindex.operationsImpl="
                        + operationsImpl + ": instantiation error.\n" + e.toString());
            } catch (IllegalAccessException e) {
                errors.append("\n*** " + configName + "/index/" + indexName + ": fgsindex.operationsImpl="
                        + operationsImpl + ": instantiation error.\n" + e.toString());
            } catch (InvocationTargetException e) {
                errors.append("\n*** " + configName + "/index/" + indexName + ": fgsindex.operationsImpl="
                        + operationsImpl + ": instantiation error.\n" + e.toString());
            } catch (NoSuchMethodException e) {
                errors.append("\n*** " + configName + "/index/" + indexName + ": fgsindex.operationsImpl="
                        + operationsImpl + ": instantiation error.\n" + e.toString());
            }
        } catch (ClassNotFoundException e) {
            errors.append("\n*** " + configName + "/index/" + indexName + ": fgsindex.operationsImpl="
                    + operationsImpl + ": class not found.\n" + e);
        }

        // Check result stylesheets
        checkResultStylesheet("/index/" + indexName, props, "fgsindex.defaultUpdateIndexDocXslt");
        checkResultStylesheet("/index/" + indexName, props, "fgsindex.defaultUpdateIndexResultXslt");
        checkResultStylesheet("/index/" + indexName, props, "fgsindex.defaultGfindObjectsResultXslt");
        checkResultStylesheet("/index/" + indexName, props, "fgsindex.defaultBrowseIndexResultXslt");
        checkResultStylesheet("/index/" + indexName, props, "fgsindex.defaultGetIndexInfoResultXslt");

        // Check indexDir (note: new File(...) never returns null; test the directory itself)
        String indexDir = insertSystemProperties(props.getProperty("fgsindex.indexDir"));
        File indexDirFile = new File(indexDir);
        if (!indexDirFile.isDirectory()) {
            errors.append("\n*** " + configName + "/index/" + indexName + " fgsindex.indexDir="
                    + indexDir + " must exist as a directory");
        }

        // Check analyzer class for lucene and solr
        if (operationsImpl.indexOf("fgslucene") > -1 || operationsImpl.indexOf("fgssolr") > -1) {
            String analyzer = props.getProperty("fgsindex.analyzer");
            if (analyzer == null || analyzer.equals("")) {
                analyzer = defaultAnalyzer;
            }
            try {
                Class analyzerClass = Class.forName(analyzer);
                try {
                    Analyzer a = (Analyzer) analyzerClass.getConstructor(new Class[] {})
                            .newInstance(new Object[] {});
                } catch (InstantiationException e) {
                    errors.append("\n*** " + configName + "/index/" + indexName + " " + analyzer
                            + ": fgsindex.analyzer=" + analyzer + ": instantiation error.\n" + e.toString());
                } catch (IllegalAccessException e) {
                    errors.append("\n*** " + configName + "/index/" + indexName + " " + analyzer
                            + ": fgsindex.analyzer=" + analyzer + ": instantiation error.\n" + e.toString());
                } catch (InvocationTargetException e) {
                    errors.append("\n*** " + configName + "/index/" + indexName + " " + analyzer
                            + ": fgsindex.analyzer=" + analyzer + ": instantiation error.\n" + e.toString());
                } catch (NoSuchMethodException e) {
                    errors.append("\n*** " + configName + "/index/" + indexName + " " + analyzer
                            + ": fgsindex.analyzer=" + analyzer + ": instantiation error:\n" + e.toString());
                }
            } catch (ClassNotFoundException e) {
                errors.append("\n*** " + configName + "/index/" + indexName + ": fgsindex.analyzer="
                        + analyzer + ": class not found:\n" + e.toString());
            }
        }

        // Add untokenizedFields property for lucene
        if (operationsImpl.indexOf("fgslucene") > -1) {
            String defaultUntokenizedFields = props.getProperty("fgsindex.untokenizedFields");
            if (defaultUntokenizedFields == null)
                props.setProperty("fgsindex.untokenizedFields", "");
            if (indexDirFile.isDirectory()) {
                StringBuffer untokenizedFields = new StringBuffer(
                        props.getProperty("fgsindex.untokenizedFields"));
                IndexReader ir = null;
                try {
                    ir = IndexReader.open(FSDirectory.open(new File(indexDir)), true);
                    // sample at most the first ten documents
                    int max = ir.numDocs();
                    if (max > 10)
                        max = 10;
                    for (int i = 0; i < max; i++) {
                        Document doc = ir.document(i);
                        for (ListIterator li = doc.getFields().listIterator(); li.hasNext();) {
                            Field f = (Field) li.next();
                            if (!f.isTokenized() && f.isIndexed() && untokenizedFields.indexOf(f.name()) < 0) {
                                untokenizedFields.append(" " + f.name());
                            }
                        }
                    }
                } catch (Exception e) {
                    // ignore: field sampling is best-effort
                }
                props.setProperty("fgsindex.untokenizedFields", untokenizedFields.toString());
                if (logger.isDebugEnabled())
                    logger.debug("indexName=" + indexName + " fgsindex.untokenizedFields="
                            + untokenizedFields);
            }
        }

        // Check defaultQueryFields - how can we check this?
        String defaultQueryFields = props.getProperty("fgsindex.defaultQueryFields");

        // Use custom URIResolver if given
        //MIH: also check for solr
        if (operationsImpl.indexOf("fgslucene") > -1 || operationsImpl.indexOf("fgssolr") > -1) {
            Class uriResolverClass = null;
            String uriResolver = props.getProperty("fgsindex.uriResolver");
            if (!(uriResolver == null || uriResolver.equals(""))) {
                try {
                    uriResolverClass = Class.forName(uriResolver);
                    try {
                        URIResolverImpl ur = (URIResolverImpl) uriResolverClass.getConstructor(new Class[] {})
                                .newInstance(new Object[] {});
                        if (ur != null) {
                            ur.setConfig(this);
                            indexNameToUriResolvers.put(indexName, ur);
                        }
                    } catch (InstantiationException e) {
                        errors.append("\n*** " + configName + "/index/" + indexName + " " + uriResolver
                                + ": fgsindex.uriResolver=" + uriResolver + ": instantiation error.\n"
                                + e.toString());
                    } catch (IllegalAccessException e) {
                        errors.append("\n*** " + configName + "/index/" + indexName + " " + uriResolver
                                + ": fgsindex.uriResolver=" + uriResolver + ": instantiation error.\n"
                                + e.toString());
                    } catch (InvocationTargetException e) {
                        errors.append("\n*** " + configName + "/index/" + indexName + " " + uriResolver
                                + ": fgsindex.uriResolver=" + uriResolver + ": instantiation error.\n"
                                + e.toString());
                    } catch (NoSuchMethodException e) {
                        errors.append("\n*** " + configName + "/index/" + indexName + " " + uriResolver
                                + ": fgsindex.uriResolver=" + uriResolver + ": instantiation error:\n"
                                + e.toString());
                    }
                } catch (ClassNotFoundException e) {
                    errors.append("\n*** " + configName + "/index/" + indexName + ": fgsindex.uriResolver="
                            + uriResolver + ": class not found:\n" + e.toString());
                }
            }
        }
    }
    if (logger.isDebugEnabled())
        logger.debug("configCheck configName=" + configName + " errors=" + errors.toString());
    if (errors.length() > 0)
        throw new ConfigException(errors.toString());
}
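The untokenizedFields detection buried in checkConfig() above caps its scan at min(numDocs(), 10) documents. The same sampling idea as a standalone sketch (Lucene 3.x API, matching the example; indexDir as above; like the original, it assumes the sampled index has no deletions):

// open read-only and sample at most the first ten documents
IndexReader ir = IndexReader.open(FSDirectory.open(new File(indexDir)), true);
Set<String> untokenized = new HashSet<String>();
int max = Math.min(ir.numDocs(), 10);
for (int i = 0; i < max; i++) {
    for (Object o : ir.document(i).getFields()) {
        Field f = (Field) o;
        if (f.isIndexed() && !f.isTokenized()) {
            untokenized.add(f.name()); // indexed but not tokenized
        }
    }
}
ir.close();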
From source file: drakkar.mast.retrieval.SVNContext.java
/**
 * {@inheritDoc}
 */
public boolean loadIndex(File indexPath) throws IOException, IndexException {
    IndexReader reader = null;
    boolean flag = false;
    // the null check must come before any dereference of indexPath
    if (indexPath == null || !indexPath.exists() || !indexPath.isDirectory()
            || !IndexReader.indexExists(FSDirectory.open(indexPath))) {
        message = "Not found index in default index path";
        OutputMonitor.printLine(message, OutputMonitor.ERROR_MESSAGE);
        throw new IndexException(message);
    } else {
        reader = IndexReader.open(FSDirectory.open(indexPath));
        loadedDocs = reader.numDocs();
        reader.close();
        message = "Loading SVN index...";
        OutputMonitor.printLine(message, OutputMonitor.INFORMATION_MESSAGE);
        this.notifyTaskProgress(INFORMATION_MESSAGE, message);
        try {
            Thread.sleep(2000);
        } catch (InterruptedException ex) {
            message = "Error loading index: " + ex.toString();
            OutputMonitor.printLine(message, OutputMonitor.ERROR_MESSAGE);
            this.notifyTaskProgress(ERROR_MESSAGE, message);
        }
        message = "Total of documents of the index: " + loadedDocs;
        OutputMonitor.printLine(message, OutputMonitor.INFORMATION_MESSAGE);
        this.notifyTaskProgress(INFORMATION_MESSAGE, message);
        flag = true;
        this.notifyLoadedDocument(loadedDocs);
    }
    return flag;
}
From source file: drakkar.mast.retrieval.SVNContext.java
/**
 * {@inheritDoc}
 */
public boolean loadIndex() throws IndexException, IOException {
    IndexReader reader = null;
    File defaultFile = new File(this.defaultIndexPath);
    boolean flag = false;
    // the null check must come before any dereference of defaultFile
    if (defaultFile == null || !defaultFile.exists() || !defaultFile.isDirectory()
            || !IndexReader.indexExists(FSDirectory.open(defaultFile))) {
        message = "Not found index in default index path";
        OutputMonitor.printLine(message, OutputMonitor.ERROR_MESSAGE);
        throw new IndexException(message);
    } else {
        reader = IndexReader.open(FSDirectory.open(defaultFile));
        loadedDocs = reader.numDocs();
        reader.close();
        message = "Loading SVN index...";
        OutputMonitor.printLine(message, OutputMonitor.INFORMATION_MESSAGE);
        this.notifyTaskProgress(INFORMATION_MESSAGE, message);
        try {
            Thread.sleep(2000);
        } catch (InterruptedException ex) {
            message = "Error loading index: " + ex.toString();
            OutputMonitor.printLine(message, OutputMonitor.ERROR_MESSAGE);
            this.notifyTaskProgress(ERROR_MESSAGE, message);
        }
        message = "Total of documents of the index: " + loadedDocs;
        OutputMonitor.printLine(message, OutputMonitor.INFORMATION_MESSAGE);
        this.notifyTaskProgress(INFORMATION_MESSAGE, message);
        flag = true;
        this.notifyLoadedDocument(loadedDocs);
    }
    return flag;
}
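Both loadIndex variants above repeat the same open/count/close sequence and differ only in where the index directory comes from. A consolidating helper, sketched against the same Lucene 3.x API (the method name countLiveDocs is hypothetical):

// Hypothetical helper: validate the directory, then return its live-document count.
private static int countLiveDocs(File indexDir) throws IOException {
    if (indexDir == null || !indexDir.isDirectory()
            || !IndexReader.indexExists(FSDirectory.open(indexDir))) {
        throw new IOException("No index found at " + indexDir);
    }
    IndexReader reader = IndexReader.open(FSDirectory.open(indexDir));
    try {
        return reader.numDocs();
    } finally {
        reader.close();
    }
}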
From source file: edu.ehu.galan.lite.algorithms.ranked.supervised.tfidf.corpus.lucene.CorpusHighFreqTerms.java
License: Open Source License
/**
 * Extracts the top n most frequent terms (by document frequency) from an existing
 * Lucene index (the dir must be specified via args, or via the tfidfTester entry in
 * resources/lite/configs/general.conf; in this case the Wikipedia corpus) and
 * reports their document frequency.
 *
 * @param args
 * @throws java.lang.Exception
 */
public static void main(String[] args) throws Exception {
    try {
        Properties prop = new Properties();
        InputStream is = new FileInputStream("resources/lite/configs/general.conf");
        prop.load(is); // load the config before reading properties from it
        FSDirectory dir;
        if (args.length == 1) {
            if (Paths.get(args[0]).toFile().isDirectory()) {
                dir = FSDirectory.open(new File(args[0]));
            } else {
                System.out.println("The specified directory does not exist\n"
                        + " falling back to the lucene index specified in the config files");
                dir = FSDirectory.open(new File(prop.getProperty("tfidfTester")));
            }
        } else if (args.length > 1) {
            System.out.println("The args only need one parameter, the directory of the Lucene index\n "
                    + "falling back to the lucene index specified in the config files");
            dir = FSDirectory.open(new File(prop.getProperty("tfidfTester")));
        } else {
            dir = FSDirectory.open(new File(prop.getProperty("tfidfTester")));
        }
        String field = null;
        boolean includeTermFreqs = true;
        IndexReader reader = DirectoryReader.open(dir);
        System.out.println("num Docs " + reader.numDocs());
        TermStats[] terms = getHighFreqTerms(reader, numTerms, field);
        if (!includeTermFreqs) {
            // default HighFreqTerms behavior
            for (int i = 0; i < terms.length; i++) {
                System.out.printf("%s:%s %,d \n", terms[i].field, terms[i].termtext.utf8ToString(),
                        terms[i].docFreq);
            }
        } else {
            TermStats[] termsWithTF = sortByTotalTermFreq(reader, terms);
            for (int i = 0; i < termsWithTF.length; i++) {
                System.out.printf("%s:%s \t totalTF = %,d \t doc freq = %,d \n", termsWithTF[i].field,
                        termsWithTF[i].termtext.utf8ToString(), termsWithTF[i].totalTermFreq,
                        termsWithTF[i].docFreq);
            }
        }
        reader.close();
    } catch (Exception ex) {
        logger.error("Does the specified directory contain a Lucene index?", ex);
    }
}
From source file: edu.illinois.cs.cogcomp.bigdata.lucene.Lucene.java
License: Open Source License
/**
 * Uses a custom similarity to compute idf; use this if you want to implement
 * IDF(numDocs, docFreq) yourself.
 *
 * @param reader
 * @param field
 * @param tfidfSIM
 * @return a map from each term in the field to its idf value
 * @throws IOException
 */
public static Map<String, Float> getIdfs(IndexReader reader, String field, TFIDFSimilarity tfidfSIM)
        throws IOException {
    Map<String, Float> docFrequencies = new HashMap<>();
    TermsEnum termEnum = MultiFields.getTerms(reader, field).iterator();
    BytesRef bytesRef;
    while ((bytesRef = termEnum.next()) != null) {
        if (termEnum.seekExact(bytesRef)) {
            String term = bytesRef.utf8ToString();
            float idf = tfidfSIM.idf(termEnum.docFreq(), reader.numDocs());
            docFrequencies.put(term, idf);
        }
    }
    return docFrequencies;
}
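Usage sketch for getIdfs: DefaultSimilarity extends TFIDFSimilarity and implements idf(docFreq, numDocs) as log(numDocs / (docFreq + 1)) + 1, so the returned map holds that value for every term in the field. The field name "text" and the index path are assumptions:

IndexReader reader = DirectoryReader.open(FSDirectory.open(indexPath)); // indexPath assumed
// the classic Lucene idf: log(numDocs / (docFreq + 1)) + 1
Map<String, Float> idfs = Lucene.getIdfs(reader, "text", new DefaultSimilarity());
System.out.println("idf('lucene') = " + idfs.get("lucene"));
reader.close();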