Example usage for org.apache.lucene.index IndexReader numDocs

List of usage examples for org.apache.lucene.index IndexReader numDocs

Introduction

This page presents example usage of org.apache.lucene.index IndexReader numDocs, collected from open-source projects.

Prototype

public abstract int numDocs();

Document

Returns the number of documents in this index.
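
Note that numDocs() counts only live documents: when the index contains deletions it can be smaller than maxDoc(), the exclusive upper bound on internal document IDs. A minimal, self-contained sketch of the call (the class name NumDocsExample and the index path "index" are placeholders, not part of any project on this page):

import java.io.File;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class NumDocsExample {
    public static void main(String[] args) throws Exception {
        // Placeholder path; point this at an existing Lucene index directory.
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File("index")));
        try {
            System.out.println("live documents:    " + reader.numDocs());
            System.out.println("max doc ID bound:  " + reader.maxDoc());
            System.out.println("deleted documents: " + reader.numDeletedDocs());
        } finally {
            reader.close();
        }
    }
}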

Usage

From source file:edu.utah.bmi.ibiomes.catalog.MetadataLookup.java

License:Open Source License

/**
 * Get all standard attributes from the dictionary
 * @return List of standard metadata attributes
 * @throws IOException
 * @throws CorruptIndexException 
 */
public MetadataAttributeList getAllMetadataAttributes() throws CorruptIndexException, IOException {
    logger.info("Loading list of standard metadata attributes");
    MetadataAttributeList attrs = new MetadataAttributeList();
    IndexReader reader = DirectoryReader.open(FSDirectory.open(indexFile));
    for (int d = 0; d < reader.numDocs(); d++) {
        Document doc = reader.document(d);
        MetadataAttribute attribute = getAttributeFromDocument(doc);
        attrs.add(attribute);
    }
    return attrs;
}
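
One caveat about the loop above: reader.document(d) takes an internal document ID, and IDs range up to maxDoc(), not numDocs(), so iterating to numDocs() is only correct for an index with no deletions. A deletion-safe sketch of the same loop, assuming the Lucene 4.x MultiFields API (an assumption; it is not used in the original source):

// requires: import org.apache.lucene.index.MultiFields;
//           import org.apache.lucene.util.Bits;
Bits liveDocs = MultiFields.getLiveDocs(reader); // null when the index has no deletions
for (int d = 0; d < reader.maxDoc(); d++) {
    if (liveDocs != null && !liveDocs.get(d)) {
        continue; // skip deleted documents
    }
    Document doc = reader.document(d);
    MetadataAttribute attribute = getAttributeFromDocument(doc);
    attrs.add(attribute);
}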

From source file:edu.utsa.sifter.IndexResource.java

License:Apache License

@Path("index")
@POST
@Consumes({ MediaType.APPLICATION_JSON })
@Produces({ MediaType.APPLICATION_JSON })
public IndexInfo openIndex(IndexInfo idx) {
    if (idx.Id == null) {
        idx.Id = new String(Hex.encodeHex(Hasher.digest(idx.Path.getBytes())));
    }
    idx.Id = idx.Id.toLowerCase();

    IndexReader rdr = State.Indices.get(idx.Id);
    if (rdr == null) {
        try {
            final File evPath = new File(idx.Path);
            final File primaryIdx = new File(evPath, "primary-idx");
            final File somIdx = new File(evPath, "som-idx");
            DirectoryReader[] parallel = new DirectoryReader[2];
            parallel[0] = DirectoryReader.open(FSDirectory.open(primaryIdx));
            parallel[1] = DirectoryReader.open(FSDirectory.open(somIdx));

            rdr = new ParallelCompositeReader(parallel);
        } catch (IOException ex) {
            HttpResponse.setStatus(HttpServletResponse.SC_NOT_FOUND);
        }
    }
    if (rdr != null) {
        idx.NumDocs = rdr.numDocs();

        State.Indices.put(idx.Id, rdr);
        State.IndexLocations.put(idx.Id, idx);
    }
    return idx;
}

From source file:edu.wayne.cs.severe.ir4se.lucene.SearchFiles.java

License:Apache License

@SuppressWarnings("deprecation")
public static void search(String system, int documentType, int queryNumber, String indexDirectoryPath,
        String queryString, String fileOutput, String[] targetClasses, boolean runIndividualTerms,
        boolean append) throws Exception {

    String index = indexDirectoryPath;
    // resolve NotFound.txt one level above the index directory, regardless of trailing separator
    FileWriter f = new FileWriter(new File(index, "../NotFound.txt"), true);

    for (int i = 0; i < targetClasses.length; i++) {
        String target = targetClasses[i];
        boolean found = Indexer.isFileDocInIndex(indexDirectoryPath, target);
        if (!found)
            f.append("Target doc " + i + " - " + target + " not found in index!\n");
    }
    f.close();
    IndexReader reader = IndexReader.open(FSDirectory.open(new File(index)), true);

    int numDocs = reader.numDocs();
    System.out.println("The number of documents in the index is: " + numDocs);

    IndexSearcher searcher = new IndexSearcher(reader);
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_35);

    String[] fields = { "contents" };

    if (!runIndividualTerms) {
        MultiFieldQueryParser parser = new MultiFieldQueryParser(Version.LUCENE_35, fields, analyzer);
        int hitsPerPage = numDocs;
        TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
        Query query = parser.parse(queryString);
        searcher.search(query, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;
        System.out.println("The number of hits is: " + hits.length);

        // file with the results (score and position) only for the relevant
        // documents
        // the file contains entries in the following format:
        // (queryNumber,relDoc1,posRelDoc1,scoreRelDoc1,relDoc2,posRelDoc2,scoreRelDoc2,...)
        FileWriter fwRelevant = new FileWriter(fileOutput, append);

        String path = "";
        String docName = "";
        String docPathAndName = "";
        for (String target : targetClasses) {
            boolean found = false;
            for (int i = 0; i < hits.length; i++) {
                int docId = hits[i].doc;
                Document d = searcher.doc(docId);
                path = d.get("path");

                float score = hits[i].score;

                if (documentType == 2) {
                    docName = d.get("docName");

                    docPathAndName = path.toLowerCase() + "." + docName.toLowerCase();

                    if (target.equalsIgnoreCase(docPathAndName)) {
                        fwRelevant.write(system + ";" + queryNumber + ";" + target + ";" + (i + 1) + ";"
                                + hits.length + ";" + numDocs + ";" + score + "\n");
                        found = true;
                        break;
                    }
                } else if (documentType == 1) {
                    File pathDir = new File(path.trim());
                    String fileName = pathDir.getName();
                    // use replace() rather than replaceAll(): the latter treats "." as a regex wildcard
                    docName = fileName.replace(".txt", "");
                    fwRelevant.write((i + 1) + ". doc = " + docName + " score = " + score + "\n");
                }
            }
            if (!found)
                fwRelevant.write(system + ";" + queryNumber + ";" + target + "; NOT_RETRIEVED" + "\n");

        }
        // fw.close();
        fwRelevant.close();
        reader.close();
    } else // runIndividualTerms = true
    {
        /**
         * each query will be divided in its constituent terms and each term
         * will be run as a separate query
         **/
        /**
         * this is useful to determine the similarity of each of the terms
         * in a query to a target document so that we determine which terms
         * in the query tend to lead to the best results, i.e., to finding
         * the targets sooner
         **/

        SearchFiles.search(system, documentType, queryNumber, indexDirectoryPath, queryString,
                fileOutput.replace(".txt", "_wholeQuery.txt"), targetClasses, false, append);

        FileWriter fw = new FileWriter(fileOutput.replace(".txt", "_terms.txt"));
        fw.write(
                "\n\n\n------------------------------------------------------------------------------------\n\n");
        fw.write("                               Results for query " + queryNumber + "\n");
        fw.write("------------------------------------------------------------------------------------\n\n");

        // file with the results (score and position) only for the relevant
        // documents
        // the file contains entries in the following format:
        // (queryNumber,term1,term1TF,term1DF,relDoc1,posRelDoc1Term1,scoreRelDoc1Term1,relDoc2,posRelDoc2Term1,scoreRelDoc2Term1,...)
        // (queryNumber,term2,term2TF,term2DF,relDoc1,posRelDoc1Term2,scoreRelDoc1Term2,relDoc2,posRelDoc2Term2,scoreRelDoc2Term2,...)
        // ...
        FileWriter fwRelevant = new FileWriter(
                fileOutput.replace(".txt", "_terms_RelevantDocsPositions.txt"));

        String[] queryTerms = queryString.split(" ");
        for (int l = 0; l < queryTerms.length; l++) {
            MultiFieldQueryParser parser = new MultiFieldQueryParser(Version.LUCENE_35, fields, analyzer);
            int hitsPerPage = numDocs;
            TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);

            String q = queryTerms[l];
            Query query = parser.parse(q);
            searcher.search(query, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;
            fw.write("TERM " + (l + 1) + ": " + q + "\n\n");
            fwRelevant.write("\n" + queryNumber + "," + q);
            for (int i = 0; i < hits.length; i++) {
                int docId = hits[i].doc;
                Document d = searcher.doc(docId);
                String path = d.get("path");
                float score = hits[i].score;
                if (documentType == 2) {
                    String docName = d.get("docName");
                    fw.write((i + 1) + ". doc = " + path + " " + docName + " - score = " + score + "\n");
                    for (int k = 0; k < targetClasses.length; k++) {
                        if (docName.equalsIgnoreCase(targetClasses[k])) {
                            String contents = d.get("contents");
                            int frequency = countOccurrences(contents, q);// tf
                            fwRelevant.write("," + frequency);

                            fwRelevant.write("," + reader.docFreq(new Term("contents", q)));// df
                            fwRelevant.write("," + path + "." + docName + "," + (i + 1) + "," + score);
                            break;
                        }
                    }
                } else if (documentType == 1) {
                    File pathDir = new File(path);
                    String fileName = pathDir.getName();
                    String docName = fileName.replace(".txt", "");
                    fw.write((i + 1) + ". doc = " + docName + " score = " + score + "\n");
                }
            }
            fw.write("\n\n\n");
        }
        fw.close();
        fwRelevant.close();
        reader.close();
    }
}

From source file:es.pode.indexador.negocio.servicios.busqueda.SrvBuscadorServiceImpl.java

License:Open Source License

/**
 * @see es.pode.indexador.negocio.servicios.busqueda.SrvBuscadorService#busquedaAvanzada(es.pode.indexador.negocio.servicios.busqueda.ParamAvanzadoVO)
 * @param paramBusq VO holding the parameters accepted by an advanced search.
 * @return DocumentosVO This class represents the results of a search.
 */
protected es.pode.indexador.negocio.servicios.busqueda.DocumentosVO handleBusquedaAvanzada(
        es.pode.indexador.negocio.servicios.busqueda.ParamAvanzadoVO paramBusq) throws java.lang.Exception {

    // Implement the advanced search.
    DocumentosVO respuesta = new DocumentosVO();
    if (logger.isDebugEnabled())
        logger.debug("SrvBuscadorServiceImpl - handleBusquedaAvanzada: Parametros de la busqueda avanzada "
                + paramBusqAvanz2String(paramBusq));
    DisjunctionMaxQuery query = new DisjunctionMaxQuery(0.01f);
    long start = System.currentTimeMillis();
    Object[] hits = null;
    boolean resultadoUnico = false;

    // Special handling for the "*" and ".*" keyword patterns
    if (paramBusq.getPalabrasClave() != null && !paramBusq.getPalabrasClave().trim().equals("*")
            && !paramBusq.getPalabrasClave().trim().equals(".*")) {
        if (paramBusq.getPalabrasClave().trim().toLowerCase()
                .startsWith(props.getProperty("agrega_admin") + " ")) {
            hits = new Hits[1];
            hits = this.internaBusquedaQuery(paramBusq, paramBusq.getPalabrasClave().toLowerCase().trim()
                    .substring((props.getProperty("agrega_admin") + " ").length()), query, false, null);
        } else if (paramBusq.getPalabrasClave().trim().toLowerCase()
                .startsWith(props.getProperty("agrega_todos") + " ")) {
            IndiceVO[] idiomas = this.handleObtenerIdiomasBusqueda();
            hits = new Hits[idiomas.length];
            hits = this.internaBusquedaQuery(paramBusq, paramBusq.getPalabrasClave().toLowerCase().trim()
                    .substring((props.getProperty("agrega_todos") + " ").length()), query, true, idiomas);
        } else {
            resultadoUnico = true;
            hits = new Hits[1];
            hits[0] = internaBusquedaAvanzada(paramBusq, query);
        }
    } else {
        resultadoUnico = true;
        hits = new Hits[1];
        hits[0] = internaBusquedaAvanzada(paramBusq, query);
    }
    long end = System.currentTimeMillis();
    logger.debug("SrvBuscadorServiceImpl - handleBusquedaAvanzada : Busqueda local realizada en ="
            + (end - start) + " milisegundos.");
    if (logger.isDebugEnabled()) {
        logger.debug("Se devuelven los hits");
        logger.debug("Se devuelven los hits NULL o 0 vamos a por las sugerencias");
    }
    if (paramBusq.getBusquedaSimpleAvanzada() != null
            && !paramBusq.getBusquedaSimpleAvanzada().equals(BUSCARRSS)) {
        try {
            Directory directorioIndiceSpell = this.indiceSpell(paramBusq.getIdiomaBusqueda());
            if (logger.isDebugEnabled())
                logger.debug("El indice de spellchecker es " + directorioIndiceSpell);
            SpellChecker spellChecker = new SpellChecker(directorioIndiceSpell);
            String sQuery = query.toString();
            // TODO: rework the suggestions so they take into account more than just the "keywords" parameters?
            if (!spellChecker.exist(sQuery)
                    && (paramBusq.getPalabrasClave() != null && !paramBusq.getPalabrasClave().equals(""))) {
                List claveBusqueda = this.obtenerPalabrasClave(paramBusq.getPalabrasClave().toLowerCase(),
                        false);
                List<String> sugerencias = new ArrayList<String>();
                if (claveBusqueda != null && claveBusqueda.size() > 0) {
                    boolean suficientes = false;
                    for (int i = 0; i < claveBusqueda.size() && !suficientes; i++) {
                        if (!((String) claveBusqueda.get(i)).equals("")) {
                            String[] suge = spellChecker.suggestSimilar((String) claveBusqueda.get(i),
                                    NUMERO_SUGERENCIAS);
                            if (suge != null && suge.length > 0) {
                                for (int k = 0; k < suge.length
                                        && sugerencias.size() < NUMERO_SUGERENCIAS; k++) {
                                    boolean encontrado = false;
                                    for (int j = 0; j < sugerencias.size() && !encontrado; j++) {
                                        if (sugerencias.get(j).toString().equals(suge[k]))
                                            encontrado = true;
                                    }
                                    if (!encontrado && validarPersonalizada(paramBusq)) {
                                        Hits hitSugerencias = null;
                                        ParamAvanzadoVO paramBusqSug = paramBusq;
                                        paramBusqSug.setPalabrasClave(suge[k]);
                                        try {
                                            hitSugerencias = internaBusquedaAvanzada(paramBusqSug, query);
                                            if (hitSugerencias != null && hitSugerencias.length() > 0)
                                                sugerencias.add(suge[k]);
                                        } catch (Exception e) {
                                            logger.error(
                                                    "SrvBuscadorServiceImpl - handleBuscarAvanzado:Error solicitando comprobaci n sugerencia avanzada. Sugerencia="
                                                            + suge[k],
                                                    e);
                                        }
                                    } else if (!encontrado && !validarPersonalizada(paramBusq))
                                        sugerencias.add(suge[k]);
                                }
                            }
                            if (sugerencias.size() == NUMERO_SUGERENCIAS)
                                suficientes = true;
                        }
                    }
                }
                String[] cargarSugerencias = new String[] {};
                if (sugerencias != null && sugerencias.size() > 0) {
                    cargarSugerencias = new String[sugerencias.size()];
                    for (int i = 0; i < sugerencias.size(); i++) {
                        cargarSugerencias[i] = sugerencias.get(i);
                    }
                }
                respuesta.setSugerencias(cargarSugerencias);
            } else
                respuesta.setSugerencias(new String[] {});
        } catch (Exception e) {
            logger.error(
                    "SrvBuscadorServiceImpl - handleBuscarAvanzado:Error solicitando sugerencias para idioma:"
                            + paramBusq.getIdiomaBusqueda(),
                    e);
            respuesta.setSugerencias(new String[] {});
        }
        try {
            es.pode.indexador.negocio.servicios.busqueda.TaxonVO[] cargarTesauros = new es.pode.indexador.negocio.servicios.busqueda.TaxonVO[] {};
            if (paramBusq.getPalabrasClave() != null && !paramBusq.getPalabrasClave().trim().equals("")) {
                List palabrasTesauro = this.obtenerPalabrasClave(paramBusq.getPalabrasClave().toLowerCase(),
                        true);
                List<String> nombreTesauros = new ArrayList<String>();
                List<String> identificadorTesauros = new ArrayList<String>();
                if (palabrasTesauro != null && palabrasTesauro.size() > 0) {
                    int numeroTax = 0;
                    for (int i = 0; i < palabrasTesauro.size()
                            && (numeroTax < Integer.parseInt(props.getProperty("numero_tesauros"))); i++) {
                        TaxonVO[] taxones = this.getSrvTesaurosServices().obtenerTerminosRelacionadosPorTexto(
                                (String) palabrasTesauro.get(i), props.getProperty("nombre_tesauro"),
                                paramBusq.getIdiomaBusqueda());
                        String[] idTesauro = new String[taxones.length];
                        for (int k = 0; k < taxones.length; k++) {
                            idTesauro[k] = taxones[k].getId();
                        }
                        for (int k = 0; k < taxones.length
                                && (numeroTax < Integer.parseInt(props.getProperty("numero_tesauros"))); k++) {
                            Integer[] tesauros = NumTermsArbol
                                    .obtenerNumeroNodos(idTesauro,
                                            getIndexPathByLanguage(paramBusq.getIdiomaBusqueda()), "tesauro")
                                    .getConteo();
                            if (idTesauro != null && idTesauro.length != 0) {
                                for (int j = 0; j < idTesauro.length; j++) {
                                    if (idTesauro[j].equals(taxones[k].getId())) {
                                        if (tesauros[j].intValue() > 0) {
                                            nombreTesauros.add(taxones[k].getValorTax());
                                            identificadorTesauros.add(taxones[k].getId());
                                            numeroTax = numeroTax + 1;
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
                if (nombreTesauros != null && nombreTesauros.size() > 0) {
                    cargarTesauros = new es.pode.indexador.negocio.servicios.busqueda.TaxonVO[nombreTesauros
                            .size()];
                    for (int i = 0; i < nombreTesauros.size(); i++) {
                        cargarTesauros[i] = new es.pode.indexador.negocio.servicios.busqueda.TaxonVO(
                                identificadorTesauros.get(i).toString(), nombreTesauros.get(i).toString());
                    }
                }
                respuesta.setTesauros(cargarTesauros);
            } else
                respuesta.setTesauros(new es.pode.indexador.negocio.servicios.busqueda.TaxonVO[] {});
        } catch (Exception e) {
            logger.error(
                    "SrvBuscadorServiceImpl - handleBuscarAvanzado:Error obteniendo sugerencias de tesauro ["
                            + props.getProperty("nombre_tesauro") + "] con:" + paramBusq.getPalabrasClave()
                            + " n mero de tesauros m ximo solicitado=" + props.getProperty("numero_tesauros")
                            + " e idioma=" + paramBusq.getIdiomaBusqueda(),
                    e);
            respuesta.setTesauros(new es.pode.indexador.negocio.servicios.busqueda.TaxonVO[] {});
        }
    }
    if (hits == null || (resultadoUnico && hits[0] == null)) {
        respuesta.setTotalResultados(new Integer(0));
        respuesta.setNumeroResultados(new Integer(0));
        respuesta.setNumDocumentosIndice(new Integer(0));
    } else {
        long start2 = System.currentTimeMillis();
        if (hits.length > 1) {
            int totalResultados = 0;
            List docsList = new ArrayList();
            for (int i = 0; i < hits.length
                    && docsList.size() <= paramBusq.getNumeroResultados().intValue(); i++) {
                if (hits[i] != null && ((Hits) hits[i]).length() > 0) {
                    totalResultados = totalResultados + ((Hits) hits[i]).length();
                    DocVO[] docs = this.getArrayDocsFromHits((Hits) hits[i],
                            ((((Hits) hits[i]).length() < paramBusq.getNumeroResultados().intValue())
                                    || paramBusq.getNumeroResultados().intValue() == -1)
                                            ? ((Hits) hits[i]).length()
                                            : paramBusq.getNumeroResultados().intValue());
                    for (int j = 0; j < docs.length; j++) {
                        docsList.add(docs[j]);
                    }
                }
            }
            DocVO[] docs = new DocVO[docsList.size()];
            for (int i = 0; i < docs.length; i++) {
                docs[i] = (DocVO) docsList.get(i);
            }
            respuesta.setTotalResultados(new Integer(totalResultados));
            respuesta.setResultados(docs);
        } else {
            respuesta.setTotalResultados(new Integer(((Hits) hits[0]).length()));
            respuesta.setResultados(this.getArrayDocsFromHits((Hits) hits[0],
                    ((((Hits) hits[0]).length() < paramBusq.getNumeroResultados().intValue())
                            || paramBusq.getNumeroResultados().intValue() == -1) ? ((Hits) hits[0]).length()
                                    : paramBusq.getNumeroResultados().intValue()));
        }
        end = System.currentTimeMillis();
        logger.debug("SrvBuscadorServiceImpl - handleBusquedaAvanzada : Mapeo local realizado en ="
                + (end - start2) + " milisegundos.");
        IndexReader reader = IndexReader.open(this.getIndexByLanguage(paramBusq.getIdiomaBusqueda()));
        respuesta.setNumDocumentosIndice(new Integer(reader.numDocs()));
        respuesta.setNumeroResultados(new Integer(respuesta.getResultados().length));
    }
    logger.debug("SrvBuscadorServiceImpl - handleBusquedaAvanzada : Busqueda local realizada en ="
            + (end - start) + " milisegundos.");
    return respuesta;
}

From source file:es.pode.indexador.negocio.servicios.busqueda.SrvBuscadorServiceImpl.java

License:Open Source License

/**
 * This method looks up a random ODE from within the repository.
 * @return DocVO Details of an indexed ODE.
 */
protected DocVO handleObtenerODERandom() throws Exception {
    List listaIndices = (List) this.getIndiceDao().loadAll(getIndiceDao().TRANSFORM_INDICEVO);
    if (listaIndices.size() == 0) // no indexes to list, nothing to return
        return null;
    Random random = new Random(Calendar.getInstance().getTimeInMillis());
    Document doc = null;
    boolean noCero = true;
    int intRandom = random.nextInt();
    int i = 0;
    int reintentosInt = 10; // intRandom may be 0; to avoid that we retry at most 10 times
    for (i = 0; i < reintentosInt && intRandom != 0 && noCero; i++) {

        // Pick an index at random from all the indexes in the repository
        int idiomaSeleciconado = (intRandom < 0 ? (intRandom * (-1)) : intRandom) % listaIndices.size();
        IndiceVO indice = (IndiceVO) listaIndices.get(idiomaSeleciconado);
        // Open the index and read the number of indexed documents
        Directory directorioIndiceSimple = null;
        directorioIndiceSimple = this.getIndexByLanguage(indice.getIdentificador());
        IndexReader indiceLectura = IndexReader.open(directorioIndiceSimple);
        int numeroDocumentos = indiceLectura.numDocs();
        logger.debug("El numero de documentos del indice es " + numeroDocumentos);
        // Select the document we are going to extract
        if (numeroDocumentos > 0) {
            intRandom = random.nextInt();
            noCero = false;
            int documentoSeleccionado = (intRandom < 0 ? (intRandom * (-1)) : intRandom) % numeroDocumentos;
            logger.info("Devuelto documento [" + documentoSeleccionado + "] de [" + numeroDocumentos
                    + "] documentos totales indexados.");
            doc = indiceLectura.document(documentoSeleccionado);
        }
        indiceLectura.close();
    }
    if (i == reintentosInt && noCero) {
        logger.info("No se ha encontrado ning n random  v lido en [" + reintentosInt + "] intentos");
    }
    if (doc != null)
        return getVOFromLucene(doc, new DocVO(), 0);
    else
        return null;
}

From source file:eu.eexcess.sourceselection.redde.dbsampling.DBSampler.java

License:Apache License

/**
 * Estimates the size of the general database using sample-resample and the
 * search term "term".
 * 
 * @param term
 *            one-term search term for general and sampled index
 * @return the estimated database size of the general index
 * @throws ParseException
 * @throws IOException
 * @throws IllegalArgumentException
 *             if an index (base or sampled) contains no documents
 */
private double resample(String term) throws ParseException, IOException, IllegalArgumentException {

    Query query = new QueryParser(Settings.IndexFields.IndexTextField, new EnglishAnalyzer()).parse(term);

    outIndexWriter.commit();

    if (inIndexReader.numDocs() <= 0) {
        throw new IllegalArgumentException("failed to resample using empty index [inIndexReader]");
    } else if (outIndexWriter.numDocs() <= 0) {
        throw new IllegalArgumentException("failed to resample using empty index [outIndexWriter]");
    }

    double estimation = 0;
    IndexReader sampleIndexReader = null;

    try {
        // get total hits for term in sample index
        sampleIndexReader = DirectoryReader.open(outIndexWriter, true);
        IndexSearcher sampleIndexSearcher = new IndexSearcher(sampleIndexReader);
        TopDocs sampleSearchDocs = sampleIndexSearcher.search(query, sampleIndexReader.numDocs());

        // get total hits for term in general index
        IndexSearcher generalIndexSearcher = new IndexSearcher(inIndexReader);
        TopDocs generalSearchDocs = generalIndexSearcher.search(query, inIndexReader.numDocs());

        estimation = estimationCalculator(generalSearchDocs.totalHits, sampleSearchDocs.totalHits,
                sampleIndexReader.numDocs(), true);
    } finally {
        if (sampleIndexReader != null) {
            sampleIndexReader.close();
        }
    }

    return estimation;
}

From source file:eu.eexcess.sourceselection.redde.Redde.java

License:Apache License

private ScoreDoc[] scoreDatabase(String queryString, IndexReader database) throws ParseException, IOException {

    Query query = new QueryParser(Settings.IndexFields.IndexTextField, new EnglishAnalyzer())
            .parse(queryString);
    IndexSearcher searcher = new IndexSearcher(database);
    TopDocs topDocs = searcher.search(query, database.numDocs());

    return topDocs.scoreDocs;
}

From source file:fr.ericlab.sondy.algo.eventdetection.EDCoW.java

License:Open Source License

@Override
public ObservableList<DetectionResult> apply() {
    try {
        if (parameters.get(0).getValue() != null && !parameters.get(0).getValue().equals("")) {
            minTermSupport = Double.parseDouble(parameters.get(0).getValue());
        }
        if (parameters.get(1).getValue() != null && !parameters.get(1).getValue().equals("")) {
            maxTermSupport = Double.parseDouble(parameters.get(1).getValue());
        }
        if (parameters.get(2).getValue() != null && !parameters.get(2).getValue().equals("")) {
            delta = Integer.parseInt(parameters.get(2).getValue());
        }
        if (parameters.get(3).getValue() != null && !parameters.get(3).getValue().equals("")) {
            gamma = Integer.parseInt(parameters.get(3).getValue());
        }
        if (parameters.get(4).getValue() != null && !parameters.get(4).getValue().equals("")) {
            delta2 = Integer.parseInt(parameters.get(4).getValue());
        }
        long startNanoTime = System.nanoTime();
        int intervals = appVariables.messageSet.nbTimeSlice;
        int windows = intervals / delta2;
        events = new LinkedList<>();

        termDocsMap = new HashMap<>();
        IndexAccess indexAccess = new IndexAccess(appVariables);
        IndexReader r = indexAccess.reader;
        TermEnum allTerms = r.terms();
        int minTermOccur = (int) (minTermSupport * appVariables.nbMessages),
                maxTermOccur = (int) (maxTermSupport * appVariables.nbMessages);
        while (allTerms.next()) {
            String term = allTerms.term().text();
            if (term.length() > 1 && !appVariables.isStopWord(term)) {
                TermDocs termDocs = r.termDocs(allTerms.term());
                float frequencyf[] = indexAccess.getTermFrequency(appVariables, termDocs);
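                // apparent convention: frequencyf holds one count per time slice, with the
                // term's total (collection) frequency stored at the extra index numDocs()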
                float cf = frequencyf[r.numDocs()];
                if (cf > minTermOccur && cf < maxTermOccur) {
                    termDocsMap.put(term, frequencyf);
                }
            }
        }
        indexAccess.close();
        for (int i = 0; i < windows; i++) {
            processWindow(i);
        }
        Collections.sort(events);
        results = FXCollections.observableArrayList();
        float intervalDuration = ((float) appVariables.getCurrentDatasetInterval()) / 60;
        for (EDCoWEvent ev : events) {
            double interval[] = ev.getInterval(intervalDuration);
            results.add(new DetectionResult(ev.getKeywordsAsString(),
                    formatter.format(interval[0]) + ";" + formatter.format(interval[1])));
        }
        long endNanoTime = System.nanoTime();
        long elapsedNanoTime = endNanoTime - startNanoTime;
        double elaspedSecondTime = (double) elapsedNanoTime / (double) 1000000000;
        appVariables.addLogEntry("[event detection] computed EDCoW, minTermSupport=" + minTermSupport
                + ", maxTermSupport=" + maxTermSupport + ", delta=" + delta + ", gamma=" + gamma + ". "
                + results.size() + " results in " + formatter.format(elaspedSecondTime) + "s");
        return results;
    } catch (NumberFormatException | IOException ex) {
        Logger.getLogger(EDCoW.class.getName()).log(Level.SEVERE, null, ex);
    }
    return null;
}

From source file:fr.ericlab.sondy.algo.eventdetection.ET.java

License:Open Source License

public ObservableList<DetectionResult> apply() {
    try {
        if (parameters.get(0).getValue() != null && !parameters.get(0).getValue().equals("")) {
            minTermSupport = Double.parseDouble(parameters.get(0).getValue());
        }
        if (parameters.get(1).getValue() != null && !parameters.get(1).getValue().equals("")) {
            maxTermSupport = Double.parseDouble(parameters.get(1).getValue());
        }
        if (parameters.get(2).getValue() != null && !parameters.get(2).getValue().equals("")) {
            gamma = Double.parseDouble(parameters.get(2).getValue());
        }
        if (parameters.get(3).getValue() != null && !parameters.get(3).getValue().equals("")) {
            alpha = Double.parseDouble(parameters.get(3).getValue());
        }

        long startNanoTime = System.nanoTime();
        HashMap<String, Bursts> mapBursts = new HashMap<>();
        HashMap<String, LinkedList<String>> mapCooccurences = new HashMap<>();
        LinkedList<String> bigrams = new LinkedList<>();
        int minTermOccur = (int) (minTermSupport * appVariables.nbMessages);
        int maxTermOccur = (int) (maxTermSupport * appVariables.nbMessages);

        DBAccess dbAccess = new DBAccess();
        dbAccess.initialize(appVariables, false);
        DataManipulation dataManipulation = new DataManipulation();
        IndexAccess indexAccess = new IndexAccess(appVariables);
        IndexReader r = indexAccess.reader;
        int intervalNumber = r.numDocs();
        TermEnum allTerms = r.terms();
        LinkedList<Frequency> frequencies = new LinkedList<>();
        while (allTerms.next()) {
            String k = allTerms.term().text();
            if (!appVariables.isStopWord(k)) {
                TermDocs termDocs = r.termDocs(allTerms.term());
                float frequency[] = indexAccess.getTermFrequency(appVariables, termDocs);
                float cf = frequency[intervalNumber];
                if (cf > minTermOccur && cf < maxTermOccur) {
                    bigrams.add(k);
                    frequencies.add(new Frequency(k, frequency, cf));
                }
            }
        }
        HashSet<String> bigramSet = new HashSet<>();
        bigramSet.addAll(bigrams);

        // Pre-computing AF total in all time-slices
        double AFtotal[] = new double[intervalNumber];
        for (int a = 0; a < intervalNumber; a++) {
            double AFa = 0;
            for (int b = 0; b < bigrams.size(); b++) {
                AFa += frequencies.get(b).AF[a];
            }
            AFtotal[a] = AFa;
        }

        int[] distribution = dataManipulation.getDistribution(appVariables);
        // Computing PF(kj_i) for all bigrams (j) in all time-slices (i)
        for (int j = 0; j < bigrams.size(); j++) {
            String k = bigrams.get(j);
            Frequency fr = frequencies.get(j);
            float AF[] = fr.AF;
            for (int i = 1; i < intervalNumber; i++) {
                double AFk = AF[i];
                double PFk = AFk / AFtotal[i];
                if (PFk > gamma) {
                    // Storing bursty intervals
                    Bursts bursts;
                    if (mapBursts.get(k) != null) {
                        bursts = mapBursts.get(k);
                    } else {
                        bursts = new Bursts();
                    }
                    // Calculating the increase
                    double h = PFk - frequencies.get(j).AF[i - 1] / distribution[i - 1];
                    if (h > 0) {
                        bursts.list.add(new Burst(i, h));
                        mapBursts.put(k, bursts);
                    }
                }
            }
            Bursts bursts = mapBursts.get(k);
            String tweets = "";
            if (bursts != null) {
                for (Burst burst : bursts.list) {
                    tweets += dbAccess.getMessagesAsString(appVariables, k, burst.U);
                }
            }
            LinkedList<String> FCB = getFrequentBigrams(tweets, bigramSet);
            mapCooccurences.put(k, FCB);
        }
        // Freeing memory
        frequencies = null;
        indexAccess.close();

        mapBursts = getSortedMapDesc(mapBursts);
        // Clustering keywords
        String strEvents = performHAC(appVariables, bigrams, mapBursts, mapCooccurences);
        long endNanoTime = System.nanoTime();
        long elapsedNanoTime = endNanoTime - startNanoTime;
        double elaspedSecondTime = (double) elapsedNanoTime / (double) 1000000000;
        readEventsFromString(strEvents);
        appVariables.addLogEntry("[event detection] computed ET, minTermSupport=" + minTermSupport
                + ", maxTermSupport=" + maxTermSupport + ", gamma=" + gamma + ", alpha=" + alpha + ". "
                + results.size() + " results in " + formatter.format(elaspedSecondTime) + "s");
        return results;
    } catch (IOException ex) {
        Logger.getLogger(ET.class.getName()).log(Level.SEVERE, null, ex);
    }
    return null;
}

From source file:fr.ericlab.sondy.algo.eventdetection.PeakyTopics.java

License:Open Source License

public ObservableList<DetectionResult> apply() {
    try {
        if (parameters.get(0).getValue() != null && !parameters.get(0).getValue().equals("")) {
            minTermSupport = Double.parseDouble(parameters.get(0).getValue());
        }
        if (parameters.get(1).getValue() != null && !parameters.get(1).getValue().equals("")) {
            maxTermSupport = Double.parseDouble(parameters.get(1).getValue());
        }
        if (parameters.get(2).getValue() != null && !parameters.get(2).getValue().equals("")) {
            smooth = Integer.parseInt(parameters.get(2).getValue());
        }
        long startNanoTime = System.nanoTime();
        DataManipulation dataManipulation = new DataManipulation();
        IndexAccess indexAccess = new IndexAccess(appVariables);
        IndexReader r = indexAccess.reader;
        TermEnum allTerms = r.terms();
        HashMap<DetectionResult, Float> score = new HashMap<>();
        int intervalNumber = r.numDocs();
        float intervalDuration = ((float) appVariables.getCurrentDatasetInterval()) / 60;
        int minTermOccur = (int) (minTermSupport * appVariables.nbMessages),
                maxTermOccur = (int) (maxTermSupport * appVariables.nbMessages);
        while (allTerms.next()) {
            String term = allTerms.term().text();
            if (term.length() > 1 && !appVariables.isStopWord(term)) {
                TermDocs termDocs = r.termDocs(allTerms.term());
                float frequency[] = indexAccess.getTermFrequency(appVariables, termDocs);
                float cf = frequency[intervalNumber];
                if (cf > minTermOccur && cf < maxTermOccur) {
                    if (smooth > 0) {
                        frequency = dataManipulation.getSmoothedTermFrequency(frequency, smooth);
                    }
                    float tf = 0;
                    int peakIndex = 0;
                    for (int i = appVariables.startTimeSlice; i <= appVariables.endTimeSlice; i++) {
                        if (frequency[i] > tf) {
                            tf = frequency[i];
                            peakIndex = i;
                        }
                    }
                    float peakDay = (peakIndex * intervalDuration) / 24;
                    float peakDay1 = ((peakIndex + 1) * intervalDuration) / 24;
                    score.put(new DetectionResult(term,
                            formatter.format(peakDay) + ";" + formatter.format(peakDay1)), tf / cf);
                }
            }
        }
        indexAccess.close();
        score = Collection.getSortedMapDesc(score);
        Set<Entry<DetectionResult, Float>> entrySet = score.entrySet();
        results = FXCollections.observableArrayList();
        for (Entry<DetectionResult, Float> entry : entrySet) {
            results.add(0, entry.getKey());
        }
        long endNanoTime = System.nanoTime();
        long elapsedNanoTime = endNanoTime - startNanoTime;
        double elaspedSecondTime = (double) elapsedNanoTime / (double) 1000000000;
        appVariables.addLogEntry("[event detection] computed peaky topics, minTermSupport=" + minTermSupport
                + ", maxTermSupport=" + maxTermSupport + ". " + results.size() + " results in "
                + formatter.format(elaspedSecondTime) + "s");
        return results;
    } catch (IOException ex) {
        Logger.getLogger(PeakyTopics.class.getName()).log(Level.SEVERE, null, ex);
        return null;
    }
}