Example usage for org.apache.lucene.index IndexReader numDocs

List of usage examples for org.apache.lucene.index IndexReader numDocs

Introduction

This page presents example usage of org.apache.lucene.index IndexReader numDocs, collected from open-source projects.

Prototype

public abstract int numDocs();

Document

Returns the number of documents in this index.
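
Note that numDocs() counts only live documents: when the index contains deletions it can be smaller than maxDoc(), the exclusive upper bound on internal document IDs. A minimal, self-contained sketch of the call (the class name NumDocsExample and the index path "index" are placeholders, not part of any project on this page):

import java.io.File;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class NumDocsExample {
    public static void main(String[] args) throws Exception {
        // Placeholder path; point this at an existing Lucene index directory.
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File("index")));
        try {
            System.out.println("live documents:    " + reader.numDocs());
            System.out.println("max doc ID bound:  " + reader.maxDoc());
            System.out.println("deleted documents: " + reader.numDeletedDocs());
        } finally {
            reader.close();
        }
    }
}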

Usage

From source file:edu.utah.bmi.ibiomes.catalog.MetadataLookup.java

License:Open Source License

/**
 * Get all standard attributes from the dictionary
 * @return List of standard metadata attributes
 * @throws IOException
 * @throws CorruptIndexException 
 */
public MetadataAttributeList getAllMetadataAttributes() throws CorruptIndexException, IOException {
    logger.info("Loading list of standard metadata attributes");
    MetadataAttributeList attrs = new MetadataAttributeList();
    IndexReader reader = DirectoryReader.open(FSDirectory.open(indexFile));
    for (int d = 0; d < reader.numDocs(); d++) {
        Document doc = reader.document(d);
        MetadataAttribute attribute = getAttributeFromDocument(doc);
        attrs.add(attribute);
    }
    return attrs;
}
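
One caveat about the loop above: reader.document(d) takes an internal document ID, and IDs range up to maxDoc(), not numDocs(), so iterating to numDocs() is only correct for an index with no deletions. A deletion-safe sketch of the same loop, assuming the Lucene 4.x MultiFields API (an assumption; it is not used in the original source):

// requires: import org.apache.lucene.index.MultiFields;
//           import org.apache.lucene.util.Bits;
Bits liveDocs = MultiFields.getLiveDocs(reader); // null when the index has no deletions
for (int d = 0; d < reader.maxDoc(); d++) {
    if (liveDocs != null && !liveDocs.get(d)) {
        continue; // skip deleted documents
    }
    Document doc = reader.document(d);
    MetadataAttribute attribute = getAttributeFromDocument(doc);
    attrs.add(attribute);
}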

From source file:edu.utsa.sifter.IndexResource.java

License:Apache License

@Path("index")
@POST
@Consumes({ MediaType.APPLICATION_JSON })
@Produces({ MediaType.APPLICATION_JSON })
public IndexInfo openIndex(IndexInfo idx) {
    if (idx.Id == null) {
        idx.Id = new String(Hex.encodeHex(Hasher.digest(idx.Path.getBytes())));
    }
    idx.Id = idx.Id.toLowerCase();

    IndexReader rdr = State.Indices.get(idx.Id);
    if (rdr == null) {
        try {
            final File evPath = new File(idx.Path);
            final File primaryIdx = new File(evPath, "primary-idx");
            final File somIdx = new File(evPath, "som-idx");
            DirectoryReader[] parallel = new DirectoryReader[2];
            parallel[0] = DirectoryReader.open(FSDirectory.open(primaryIdx));
            parallel[1] = DirectoryReader.open(FSDirectory.open(somIdx));

            rdr = new ParallelCompositeReader(parallel);
        } catch (IOException ex) {
            HttpResponse.setStatus(HttpServletResponse.SC_NOT_FOUND);
        }
    }
    if (rdr != null) {
        idx.NumDocs = rdr.numDocs();

        State.Indices.put(idx.Id, rdr);
        State.IndexLocations.put(idx.Id, idx);
    }
    return idx;
}

From source file:edu.wayne.cs.severe.ir4se.lucene.SearchFiles.java

License:Apache License

@SuppressWarnings("deprecation")
public static void search(String system, int documentType, int queryNumber, String indexDirectoryPath,
        String queryString, String fileOutput, String[] targetClasses, boolean runIndividualTerms,
        boolean append) throws Exception {

    String index = indexDirectoryPath;
    // resolve NotFound.txt one level above the index directory, regardless of trailing separator
    FileWriter f = new FileWriter(new File(index, "../NotFound.txt"), true);

    for (int i = 0; i < targetClasses.length; i++) {
        String target = targetClasses[i];
        boolean found = Indexer.isFileDocInIndex(indexDirectoryPath, target);
        if (!found)
            f.append("Target doc " + i + " - " + target + " not found in index!\n");
    }
    f.close();
    IndexReader reader = IndexReader.open(FSDirectory.open(new File(index)), true);

    int numDocs = reader.numDocs();
    System.out.println("The number of documents in the index is: " + numDocs);

    IndexSearcher searcher = new IndexSearcher(reader);
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_35);

    String[] fields = { "contents" };

    if (!runIndividualTerms) {
        MultiFieldQueryParser parser = new MultiFieldQueryParser(Version.LUCENE_35, fields, analyzer);
        int hitsPerPage = numDocs;
        TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
        Query query = parser.parse(queryString);
        searcher.search(query, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;
        System.out.println("The number of hits is: " + hits.length);

        // file with the results (score and position) only for the relevant
        // documents
        // the file contains entries in the following format:
        // (queryNumber,relDoc1,posRelDoc1,scoreRelDoc1,relDoc2,posRelDoc2,scoreRelDoc2,...)
        FileWriter fwRelevant = new FileWriter(fileOutput, append);

        String path = "";
        String docName = "";
        String docPathAndName = "";
        for (String target : targetClasses) {
            boolean found = false;
            for (int i = 0; i < hits.length; i++) {
                int docId = hits[i].doc;
                Document d = searcher.doc(docId);
                path = d.get("path");

                float score = hits[i].score;

                if (documentType == 2) {
                    docName = d.get("docName");

                    docPathAndName = path.toLowerCase() + "." + docName.toLowerCase();

                    if (target.equalsIgnoreCase(docPathAndName)) {
                        fwRelevant.write(system + ";" + queryNumber + ";" + target + ";" + (i + 1) + ";"
                                + hits.length + ";" + numDocs + ";" + score + "\n");
                        found = true;
                        break;
                    }
                } else if (documentType == 1) {
                    File pathDir = new File(path.trim());
                    String fileName = pathDir.getName();
                    // use replace() rather than replaceAll(): the latter treats "." as a regex wildcard
                    docName = fileName.replace(".txt", "");
                    fwRelevant.write((i + 1) + ". doc = " + docName + " score = " + score + "\n");
                }
            }
            if (!found)
                fwRelevant.write(system + ";" + queryNumber + ";" + target + "; NOT_RETRIEVED" + "\n");

        }
        // fw.close();
        fwRelevant.close();
        reader.close();
    } else // runIndividualTerms = true
    {
        /**
         * each query will be divided in its constituent terms and each term
         * will be run as a separate query
         **/
        /**
         * this is useful to determine the similarity of each of the terms
         * in a query to a target document so that we determine which terms
         * in the query tend to lead to the best results, i.e., to finding
         * the targets sooner
         **/

        SearchFiles.search(system, documentType, queryNumber, indexDirectoryPath, queryString,
                fileOutput.replace(".txt", "_wholeQuery.txt"), targetClasses, false, append);

        FileWriter fw = new FileWriter(fileOutput.replace(".txt", "_terms.txt"));
        fw.write(
                "\n\n\n------------------------------------------------------------------------------------\n\n");
        fw.write("                               Results for query " + queryNumber + "\n");
        fw.write("------------------------------------------------------------------------------------\n\n");

        // file with the results (score and position) only for the relevant
        // documents
        // the file contains entries in the following format:
        // (queryNumber,term1,term1TF,term1DF,relDoc1,posRelDoc1Term1,scoreRelDoc1Term1,relDoc2,posRelDoc2Term1,scoreRelDoc2Term1,...)
        // (queryNumber,term2,term2TF,term2DF,relDoc1,posRelDoc1Term2,scoreRelDoc1Term2,relDoc2,posRelDoc2Term2,scoreRelDoc2Term2,...)
        // ...
        FileWriter fwRelevant = new FileWriter(
                fileOutput.replace(".txt", "_terms_RelevantDocsPositions.txt"));

        String[] queryTerms = queryString.split(" ");
        for (int l = 0; l < queryTerms.length; l++) {
            MultiFieldQueryParser parser = new MultiFieldQueryParser(Version.LUCENE_35, fields, analyzer);
            int hitsPerPage = numDocs;
            TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);

            String q = queryTerms[l];
            Query query = parser.parse(q);
            searcher.search(query, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;
            fw.write("TERM " + (l + 1) + ": " + q + "\n\n");
            fwRelevant.write("\n" + queryNumber + "," + q);
            for (int i = 0; i < hits.length; i++) {
                int docId = hits[i].doc;
                Document d = searcher.doc(docId);
                String path = d.get("path");
                float score = hits[i].score;
                if (documentType == 2) {
                    String docName = d.get("docName");
                    fw.write((i + 1) + ". doc = " + path + " " + docName + " - score = " + score + "\n");
                    for (int k = 0; k < targetClasses.length; k++) {
                        if (docName.equalsIgnoreCase(targetClasses[k])) {
                            String contents = d.get("contents");
                            int frequency = countOccurrences(contents, q);// tf
                            fwRelevant.write("," + frequency);

                            fwRelevant.write("," + reader.docFreq(new Term("contents", q)));// df
                            fwRelevant.write("," + path + "." + docName + "," + (i + 1) + "," + score);
                            break;
                        }
                    }
                } else if (documentType == 1) {
                    File pathDir = new File(path);
                    String fileName = pathDir.getName();
                    String docName = fileName.replace(".txt", "");
                    fw.write((i + 1) + ". doc = " + docName + " score = " + score + "\n");
                }
            }
            fw.write("\n\n\n");
        }
        fw.close();
        fwRelevant.close();
        reader.close();
    }
}

From source file:es.pode.indexador.negocio.servicios.busqueda.SrvBuscadorServiceImpl.java

License:Open Source License

/**
 * @see es.pode.indexador.negocio.servicios.busqueda.SrvBuscadorService#busquedaAvanzada(es.pode.indexador.negocio.servicios.busqueda.ParamAvanzadoVO)
 * @param paramBusq VO holding the parameters accepted by an advanced search.
 * @return DocumentosVO This class represents the results of a search.
 */
protected es.pode.indexador.negocio.servicios.busqueda.DocumentosVO handleBusquedaAvanzada(
        es.pode.indexador.negocio.servicios.busqueda.ParamAvanzadoVO paramBusq) throws java.lang.Exception {

    // Implement the advanced search.
    DocumentosVO respuesta = new DocumentosVO();
    if (logger.isDebugEnabled())
        logger.debug("SrvBuscadorServiceImpl - handleBusquedaAvanzada: Parametros de la busqueda avanzada "
                + paramBusqAvanz2String(paramBusq));
    DisjunctionMaxQuery query = new DisjunctionMaxQuery(0.01f);
    long start = System.currentTimeMillis();
    Object[] hits = null;
    boolean resultadoUnico = false;

    // Special handling for the "*" and ".*" keyword patterns
    if (paramBusq.getPalabrasClave() != null && !paramBusq.getPalabrasClave().trim().equals("*")
            && !paramBusq.getPalabrasClave().trim().equals(".*")) {
        if (paramBusq.getPalabrasClave().trim().toLowerCase()
                .startsWith(props.getProperty("agrega_admin") + " ")) {
            hits = new Hits[1];
            hits = this.internaBusquedaQuery(paramBusq, paramBusq.getPalabrasClave().toLowerCase().trim()
                    .substring((props.getProperty("agrega_admin") + " ").length()), query, false, null);
        } else if (paramBusq.getPalabrasClave().trim().toLowerCase()
                .startsWith(props.getProperty("agrega_todos") + " ")) {
            IndiceVO[] idiomas = this.handleObtenerIdiomasBusqueda();
            hits = new Hits[idiomas.length];
            hits = this.internaBusquedaQuery(paramBusq, paramBusq.getPalabrasClave().toLowerCase().trim()
                    .substring((props.getProperty("agrega_todos") + " ").length()), query, true, idiomas);
        } else {
            resultadoUnico = true;
            hits = new Hits[1];
            hits[0] = internaBusquedaAvanzada(paramBusq, query);
        }
    } else {
        resultadoUnico = true;
        hits = new Hits[1];
        hits[0] = internaBusquedaAvanzada(paramBusq, query);
    }
    long end = System.currentTimeMillis();
    logger.debug("SrvBuscadorServiceImpl - handleBusquedaAvanzada : Busqueda local realizada en ="
            + (end - start) + " milisegundos.");
    if (logger.isDebugEnabled()) {
        logger.debug("Se devuelven los hits");
        logger.debug("Se devuelven los hits NULL o 0 vamos a por las sugerencias");
    }
    if (paramBusq.getBusquedaSimpleAvanzada() != null
            && !paramBusq.getBusquedaSimpleAvanzada().equals(BUSCARRSS)) {
        try {
            Directory directorioIndiceSpell = this.indiceSpell(paramBusq.getIdiomaBusqueda());
            if (logger.isDebugEnabled())
                logger.debug("El indice de spellchecker es " + directorioIndiceSpell);
            SpellChecker spellChecker = new SpellChecker(directorioIndiceSpell);
            String sQuery = query.toString();
            // TODO: rework the suggestions so they take into account more than just the "keywords" parameters?
            if (!spellChecker.exist(sQuery)
                    && (paramBusq.getPalabrasClave() != null && !paramBusq.getPalabrasClave().equals(""))) {
                List claveBusqueda = this.obtenerPalabrasClave(paramBusq.getPalabrasClave().toLowerCase(),
                        false);
                List<String> sugerencias = new ArrayList<String>();
                if (claveBusqueda != null && claveBusqueda.size() > 0) {
                    boolean suficientes = false;
                    for (int i = 0; i < claveBusqueda.size() && !suficientes; i++) {
                        if (!((String) claveBusqueda.get(i)).equals("")) {
                            String[] suge = spellChecker.suggestSimilar((String) claveBusqueda.get(i),
                                    NUMERO_SUGERENCIAS);
                            if (suge != null && suge.length > 0) {
                                for (int k = 0; k < suge.length
                                        && sugerencias.size() < NUMERO_SUGERENCIAS; k++) {
                                    boolean encontrado = false;
                                    for (int j = 0; j < sugerencias.size() && !encontrado; j++) {
                                        if (sugerencias.get(j).toString().equals(suge[k]))
                                            encontrado = true;
                                    }
                                    if (!encontrado && validarPersonalizada(paramBusq)) {
                                        Hits hitSugerencias = null;
                                        ParamAvanzadoVO paramBusqSug = paramBusq;
                                        paramBusqSug.setPalabrasClave(suge[k]);
                                        try {
                                            hitSugerencias = internaBusquedaAvanzada(paramBusqSug, query);
                                            if (hitSugerencias != null && hitSugerencias.length() > 0)
                                                sugerencias.add(suge[k]);
                                        } catch (Exception e) {
                                            logger.error(
                                                    "SrvBuscadorServiceImpl - handleBuscarAvanzado:Error solicitando comprobaci n sugerencia avanzada. Sugerencia="
                                                            + suge[k],
                                                    e);
                                        }
                                    } else if (!encontrado && !validarPersonalizada(paramBusq))
                                        sugerencias.add(suge[k]);
                                }
                            }
                            if (sugerencias.size() == NUMERO_SUGERENCIAS)
                                suficientes = true;
                        }
                    }
                }
                String[] cargarSugerencias = new String[] {};
                if (sugerencias != null && sugerencias.size() > 0) {
                    cargarSugerencias = new String[sugerencias.size()];
                    for (int i = 0; i < sugerencias.size(); i++) {
                        cargarSugerencias[i] = sugerencias.get(i);
                    }
                }
                respuesta.setSugerencias(cargarSugerencias);
            } else
                respuesta.setSugerencias(new String[] {});
        } catch (Exception e) {
            logger.error(
                    "SrvBuscadorServiceImpl - handleBuscarAvanzado:Error solicitando sugerencias para idioma:"
                            + paramBusq.getIdiomaBusqueda(),
                    e);
            respuesta.setSugerencias(new String[] {});
        }
        try {
            es.pode.indexador.negocio.servicios.busqueda.TaxonVO[] cargarTesauros = new es.pode.indexador.negocio.servicios.busqueda.TaxonVO[] {};
            if (paramBusq.getPalabrasClave() != null && !paramBusq.getPalabrasClave().trim().equals("")) {
                List palabrasTesauro = this.obtenerPalabrasClave(paramBusq.getPalabrasClave().toLowerCase(),
                        true);
                List<String> nombreTesauros = new ArrayList<String>();
                List<String> identificadorTesauros = new ArrayList<String>();
                if (palabrasTesauro != null && palabrasTesauro.size() > 0) {
                    int numeroTax = 0;
                    for (int i = 0; i < palabrasTesauro.size()
                            && (numeroTax < Integer.parseInt(props.getProperty("numero_tesauros"))); i++) {
                        TaxonVO[] taxones = this.getSrvTesaurosServices().obtenerTerminosRelacionadosPorTexto(
                                (String) palabrasTesauro.get(i), props.getProperty("nombre_tesauro"),
                                paramBusq.getIdiomaBusqueda());
                        String[] idTesauro = new String[taxones.length];
                        for (int k = 0; k < taxones.length; k++) {
                            idTesauro[k] = taxones[k].getId();
                        }
                        for (int k = 0; k < taxones.length
                                && (numeroTax < Integer.parseInt(props.getProperty("numero_tesauros"))); k++) {
                            Integer[] tesauros = NumTermsArbol
                                    .obtenerNumeroNodos(idTesauro,
                                            getIndexPathByLanguage(paramBusq.getIdiomaBusqueda()), "tesauro")
                                    .getConteo();
                            if (idTesauro != null && idTesauro.length != 0) {
                                for (int j = 0; j < idTesauro.length; j++) {
                                    if (idTesauro[j].equals(taxones[k].getId())) {
                                        if (tesauros[j].intValue() > 0) {
                                            nombreTesauros.add(taxones[k].getValorTax());
                                            identificadorTesauros.add(taxones[k].getId());
                                            numeroTax = numeroTax + 1;
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
                if (nombreTesauros != null && nombreTesauros.size() > 0) {
                    cargarTesauros = new es.pode.indexador.negocio.servicios.busqueda.TaxonVO[nombreTesauros
                            .size()];
                    for (int i = 0; i < nombreTesauros.size(); i++) {
                        cargarTesauros[i] = new es.pode.indexador.negocio.servicios.busqueda.TaxonVO(
                                identificadorTesauros.get(i).toString(), nombreTesauros.get(i).toString());
                    }
                }
                respuesta.setTesauros(cargarTesauros);
            } else
                respuesta.setTesauros(new es.pode.indexador.negocio.servicios.busqueda.TaxonVO[] {});
        } catch (Exception e) {
            logger.error(
                    "SrvBuscadorServiceImpl - handleBuscarAvanzado:Error obteniendo sugerencias de tesauro ["
                            + props.getProperty("nombre_tesauro") + "] con:" + paramBusq.getPalabrasClave()
                            + " n mero de tesauros m ximo solicitado=" + props.getProperty("numero_tesauros")
                            + " e idioma=" + paramBusq.getIdiomaBusqueda(),
                    e);
            respuesta.setTesauros(new es.pode.indexador.negocio.servicios.busqueda.TaxonVO[] {});
        }
    }
    if (hits == null || (resultadoUnico && hits[0] == null)) {
        respuesta.setTotalResultados(new Integer(0));
        respuesta.setNumeroResultados(new Integer(0));
        respuesta.setNumDocumentosIndice(new Integer(0));
    } else {
        long start2 = System.currentTimeMillis();
        if (hits.length > 1) {
            int totalResultados = 0;
            List docsList = new ArrayList();
            for (int i = 0; i < hits.length
                    && docsList.size() <= paramBusq.getNumeroResultados().intValue(); i++) {
                if (hits[i] != null && ((Hits) hits[i]).length() > 0) {
                    totalResultados = totalResultados + ((Hits) hits[i]).length();
                    DocVO[] docs = this.getArrayDocsFromHits((Hits) hits[i],
                            ((((Hits) hits[i]).length() < paramBusq.getNumeroResultados().intValue())
                                    || paramBusq.getNumeroResultados().intValue() == -1)
                                            ? ((Hits) hits[i]).length()
                                            : paramBusq.getNumeroResultados().intValue());
                    for (int j = 0; j < docs.length; j++) {
                        docsList.add(docs[j]);
                    }
                }
            }
            DocVO[] docs = new DocVO[docsList.size()];
            for (int i = 0; i < docs.length; i++) {
                docs[i] = (DocVO) docsList.get(i);
            }
            respuesta.setTotalResultados(new Integer(totalResultados));
            respuesta.setResultados(docs);
        } else {
            respuesta.setTotalResultados(new Integer(((Hits) hits[0]).length()));
            respuesta.setResultados(this.getArrayDocsFromHits((Hits) hits[0],
                    ((((Hits) hits[0]).length() < paramBusq.getNumeroResultados().intValue())
                            || paramBusq.getNumeroResultados().intValue() == -1) ? ((Hits) hits[0]).length()
                                    : paramBusq.getNumeroResultados().intValue()));
        }
        end = System.currentTimeMillis();
        logger.debug("SrvBuscadorServiceImpl - handleBusquedaAvanzada : Mapeo local realizado en ="
                + (end - start2) + " milisegundos.");
        IndexReader reader = IndexReader.open(this.getIndexByLanguage(paramBusq.getIdiomaBusqueda()));
        respuesta.setNumDocumentosIndice(new Integer(reader.numDocs()));
        respuesta.setNumeroResultados(new Integer(respuesta.getResultados().length));
    }
    logger.debug("SrvBuscadorServiceImpl - handleBusquedaAvanzada : Busqueda local realizada en ="
            + (end - start) + " milisegundos.");
    return respuesta;
}

From source file:es.pode.indexador.negocio.servicios.busqueda.SrvBuscadorServiceImpl.java

License:Open Source License

/**
 * This method looks up a random ODE from within the repository.
 * @return DocVO Details of an indexed ODE.
 */
protected DocVO handleObtenerODERandom() throws Exception {
    List listaIndices = (List) this.getIndiceDao().loadAll(getIndiceDao().TRANSFORM_INDICEVO);
    if (listaIndices.size() == 0) // no indexes to list, nothing to return
        return null;
    Random random = new Random(Calendar.getInstance().getTimeInMillis());
    Document doc = null;
    boolean noCero = true;
    int intRandom = random.nextInt();
    int i = 0;
    int reintentosInt = 10; // intRandom may be 0; to avoid that we retry at most 10 times
    for (i = 0; i < reintentosInt && intRandom != 0 && noCero; i++) {

        // Pick an index at random from all the indexes in the repository
        int idiomaSeleciconado = (intRandom < 0 ? (intRandom * (-1)) : intRandom) % listaIndices.size();
        IndiceVO indice = (IndiceVO) listaIndices.get(idiomaSeleciconado);
        // Open the index and read the number of indexed documents
        Directory directorioIndiceSimple = null;
        directorioIndiceSimple = this.getIndexByLanguage(indice.getIdentificador());
        IndexReader indiceLectura = IndexReader.open(directorioIndiceSimple);
        int numeroDocumentos = indiceLectura.numDocs();
        logger.debug("El numero de documentos del indice es " + numeroDocumentos);
        // Select the document we are going to extract
        if (numeroDocumentos > 0) {
            intRandom = random.nextInt();
            noCero = false;
            int documentoSeleccionado = (intRandom < 0 ? (intRandom * (-1)) : intRandom) % numeroDocumentos;
            logger.info("Devuelto documento [" + documentoSeleccionado + "] de [" + numeroDocumentos
                    + "] documentos totales indexados.");
            doc = indiceLectura.document(documentoSeleccionado);
        }
        indiceLectura.close();
    }
    if (i == reintentosInt && noCero) {
        logger.info("No se ha encontrado ning n random  v lido en [" + reintentosInt + "] intentos");
    }
    if (doc != null)
        return getVOFromLucene(doc, new DocVO(), 0);
    else
        return null;
}

From source file:eu.eexcess.sourceselection.redde.dbsampling.DBSampler.java

License:Apache License

/**
 * Estimates the size of the general database using sample-resample and the
 * search term "term".
 * 
 * @param term
 *            one-term search term for general and sampled index
 * @return the estimated database size of the general index
 * @throws ParseException
 * @throws IOException
 * @throws IllegalArgumentException
 *             if an index (base or sampled) contains no documents
 */
private double resample(String term) throws ParseException, IOException, IllegalArgumentException {

    Query query = new QueryParser(Settings.IndexFields.IndexTextField, new EnglishAnalyzer()).parse(term);

    outIndexWriter.commit();

    if (inIndexReader.numDocs() <= 0) {
        throw new IllegalArgumentException("failed to resample using empty index [inIndexReader]");
    } else if (outIndexWriter.numDocs() <= 0) {
        throw new IllegalArgumentException("failed to resample using empty index [outIndexWriter]");
    }

    double estimation = 0;
    IndexReader sampleIndexReader = null;

    try {
        // get total hits for term in sample index
        sampleIndexReader = DirectoryReader.open(outIndexWriter, true);
        IndexSearcher sampleIndexSearcher = new IndexSearcher(sampleIndexReader);
        TopDocs sampleSearchDocs = sampleIndexSearcher.search(query, sampleIndexReader.numDocs());

        // get total hits for term in general index
        IndexSearcher generalIndexSearcher = new IndexSearcher(inIndexReader);
        TopDocs generalSearchDocs = generalIndexSearcher.search(query, inIndexReader.numDocs());

        estimation = estimationCalculator(generalSearchDocs.totalHits, sampleSearchDocs.totalHits,
                sampleIndexReader.numDocs(), true);
    } finally {
        if (sampleIndexReader != null) {
            sampleIndexReader.close();
        }
    }

    return estimation;
}

From source file:eu.eexcess.sourceselection.redde.Redde.java

License:Apache License

private ScoreDoc[] scoreDatabase(String queryString, IndexReader database) throws ParseException, IOException {

    Query query = new QueryParser(Settings.IndexFields.IndexTextField, new EnglishAnalyzer())
            .parse(queryString);
    IndexSearcher searcher = new IndexSearcher(database);
    TopDocs topDocs = searcher.search(query, database.numDocs());

    return topDocs.scoreDocs;
}

From source file:fr.ericlab.sondy.algo.eventdetection.EDCoW.java

License:Open Source License

@Override
public ObservableList<DetectionResult> apply() {
    try {
        if (parameters.get(0).getValue() != null && !parameters.get(0).getValue().equals("")) {
            minTermSupport = Double.parseDouble(parameters.get(0).getValue());
        }
        if (parameters.get(1).getValue() != null && !parameters.get(1).getValue().equals("")) {
            maxTermSupport = Double.parseDouble(parameters.get(1).getValue());
        }
        if (parameters.get(2).getValue() != null && !parameters.get(2).getValue().equals("")) {
            delta = Integer.parseInt(parameters.get(2).getValue());
        }
        if (parameters.get(3).getValue() != null && !parameters.get(3).getValue().equals("")) {
            gamma = Integer.parseInt(parameters.get(3).getValue());
        }
        if (parameters.get(4).getValue() != null && !parameters.get(4).getValue().equals("")) {
            delta2 = Integer.parseInt(parameters.get(4).getValue());
        }
        long startNanoTime = System.nanoTime();
        int intervals = appVariables.messageSet.nbTimeSlice;
        int windows = intervals / delta2;
        events = new LinkedList<>();

        termDocsMap = new HashMap<>();
        IndexAccess indexAccess = new IndexAccess(appVariables);
        IndexReader r = indexAccess.reader;
        TermEnum allTerms = r.terms();
        int minTermOccur = (int) (minTermSupport * appVariables.nbMessages),
                maxTermOccur = (int) (maxTermSupport * appVariables.nbMessages);
        while (allTerms.next()) {
            String term = allTerms.term().text();
            if (term.length() > 1 && !appVariables.isStopWord(term)) {
                TermDocs termDocs = r.termDocs(allTerms.term());
                float frequencyf[] = indexAccess.getTermFrequency(appVariables, termDocs);
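                // apparent convention: frequencyf holds one count per time slice, with the
                // term's total (collection) frequency stored at the extra index numDocs()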
                float cf = frequencyf[r.numDocs()];
                if (cf > minTermOccur && cf < maxTermOccur) {
                    termDocsMap.put(term, frequencyf);
                }
            }
        }
        indexAccess.close();
        for (int i = 0; i < windows; i++) {
            processWindow(i);
        }
        Collections.sort(events);
        results = FXCollections.observableArrayList();
        float intervalDuration = ((float) appVariables.getCurrentDatasetInterval()) / 60;
        for (EDCoWEvent ev : events) {
            double interval[] = ev.getInterval(intervalDuration);
            results.add(new DetectionResult(ev.getKeywordsAsString(),
                    formatter.format(interval[0]) + ";" + formatter.format(interval[1])));
        }
        long endNanoTime = System.nanoTime();
        long elapsedNanoTime = endNanoTime - startNanoTime;
        double elaspedSecondTime = (double) elapsedNanoTime / (double) 1000000000;
        appVariables.addLogEntry("[event detection] computed EDCoW, minTermSupport=" + minTermSupport
                + ", maxTermSupport=" + maxTermSupport + ", delta=" + delta + ", gamma=" + gamma + ". "
                + results.size() + " results in " + formatter.format(elaspedSecondTime) + "s");
        return results;
    } catch (NumberFormatException | IOException ex) {
        Logger.getLogger(EDCoW.class.getName()).log(Level.SEVERE, null, ex);
    }
    return null;
}

From source file:fr.ericlab.sondy.algo.eventdetection.ET.java

License:Open Source License

public ObservableList<DetectionResult> apply() {
    try {
        if (parameters.get(0).getValue() != null && !parameters.get(0).getValue().equals("")) {
            minTermSupport = Double.parseDouble(parameters.get(0).getValue());
        }
        if (parameters.get(1).getValue() != null && !parameters.get(1).getValue().equals("")) {
            maxTermSupport = Double.parseDouble(parameters.get(1).getValue());
        }
        if (parameters.get(2).getValue() != null && !parameters.get(2).getValue().equals("")) {
            gamma = Double.parseDouble(parameters.get(2).getValue());
        }
        if (parameters.get(3).getValue() != null && !parameters.get(3).getValue().equals("")) {
            alpha = Double.parseDouble(parameters.get(3).getValue());
        }

        long startNanoTime = System.nanoTime();
        HashMap<String, Bursts> mapBursts = new HashMap<>();
        HashMap<String, LinkedList<String>> mapCooccurences = new HashMap<>();
        LinkedList<String> bigrams = new LinkedList<>();
        int minTermOccur = (int) (minTermSupport * appVariables.nbMessages);
        int maxTermOccur = (int) (maxTermSupport * appVariables.nbMessages);

        DBAccess dbAccess = new DBAccess();
        dbAccess.initialize(appVariables, false);
        DataManipulation dataManipulation = new DataManipulation();
        IndexAccess indexAccess = new IndexAccess(appVariables);
        IndexReader r = indexAccess.reader;
        int intervalNumber = r.numDocs();
        TermEnum allTerms = r.terms();
        LinkedList<Frequency> frequencies = new LinkedList<>();
        while (allTerms.next()) {
            String k = allTerms.term().text();
            if (!appVariables.isStopWord(k)) {
                TermDocs termDocs = r.termDocs(allTerms.term());
                float frequency[] = indexAccess.getTermFrequency(appVariables, termDocs);
                float cf = frequency[intervalNumber];
                if (cf > minTermOccur && cf < maxTermOccur) {
                    bigrams.add(k);
                    frequencies.add(new Frequency(k, frequency, cf));
                }
            }
        }
        HashSet<String> bigramSet = new HashSet<>();
        bigramSet.addAll(bigrams);

        // Pre-computing AF total in all time-slices
        double AFtotal[] = new double[intervalNumber];
        for (int a = 0; a < intervalNumber; a++) {
            double AFa = 0;
            for (int b = 0; b < bigrams.size(); b++) {
                AFa += frequencies.get(b).AF[a];
            }
            AFtotal[a] = AFa;
        }

        int[] distribution = dataManipulation.getDistribution(appVariables);
        // Computing PF(kj_i) for all bigrams (j) in all time-slices (i)
        for (int j = 0; j < bigrams.size(); j++) {
            String k = bigrams.get(j);
            Frequency fr = frequencies.get(j);
            float AF[] = fr.AF;
            for (int i = 1; i < intervalNumber; i++) {
                double AFk = AF[i];
                double PFk = AFk / AFtotal[i];
                if (PFk > gamma) {
                    // Storing bursty intervals
                    Bursts bursts;
                    if (mapBursts.get(k) != null) {
                        bursts = mapBursts.get(k);
                    } else {
                        bursts = new Bursts();
                    }
                    // Calculating the increase
                    double h = PFk - frequencies.get(j).AF[i - 1] / distribution[i - 1];
                    if (h > 0) {
                        bursts.list.add(new Burst(i, h));
                        mapBursts.put(k, bursts);
                    }
                }
            }
            Bursts bursts = mapBursts.get(k);
            String tweets = "";
            if (bursts != null) {
                for (Burst burst : bursts.list) {
                    tweets += dbAccess.getMessagesAsString(appVariables, k, burst.U);
                }
            }
            LinkedList<String> FCB = getFrequentBigrams(tweets, bigramSet);
            mapCooccurences.put(k, FCB);
        }
        // Freeing memory
        frequencies = null;
        indexAccess.close();

        mapBursts = getSortedMapDesc(mapBursts);
        // Clustering keywords
        String strEvents = performHAC(appVariables, bigrams, mapBursts, mapCooccurences);
        long endNanoTime = System.nanoTime();
        long elapsedNanoTime = endNanoTime - startNanoTime;
        double elaspedSecondTime = (double) elapsedNanoTime / (double) 1000000000;
        readEventsFromString(strEvents);
        appVariables.addLogEntry("[event detection] computed ET, minTermSupport=" + minTermSupport
                + ", maxTermSupport=" + maxTermSupport + ", gamma=" + gamma + ", alpha=" + alpha + ". "
                + results.size() + " results in " + formatter.format(elaspedSecondTime) + "s");
        return results;
    } catch (IOException ex) {
        Logger.getLogger(ET.class.getName()).log(Level.SEVERE, null, ex);
    }
    return null;
}

From source file:fr.ericlab.sondy.algo.eventdetection.PeakyTopics.java

License:Open Source License

public ObservableList<DetectionResult> apply() {
    try {
        if (parameters.get(0).getValue() != null && !parameters.get(0).getValue().equals("")) {
            minTermSupport = Double.parseDouble(parameters.get(0).getValue());
        }
        if (parameters.get(1).getValue() != null && !parameters.get(1).getValue().equals("")) {
            maxTermSupport = Double.parseDouble(parameters.get(1).getValue());
        }
        if (parameters.get(2).getValue() != null && !parameters.get(2).getValue().equals("")) {
            smooth = Integer.parseInt(parameters.get(2).getValue());
        }
        long startNanoTime = System.nanoTime();
        DataManipulation dataManipulation = new DataManipulation();
        IndexAccess indexAccess = new IndexAccess(appVariables);
        IndexReader r = indexAccess.reader;
        TermEnum allTerms = r.terms();
        HashMap<DetectionResult, Float> score = new HashMap<>();
        int intervalNumber = r.numDocs();
        float intervalDuration = ((float) appVariables.getCurrentDatasetInterval()) / 60;
        int minTermOccur = (int) (minTermSupport * appVariables.nbMessages),
                maxTermOccur = (int) (maxTermSupport * appVariables.nbMessages);
        while (allTerms.next()) {
            String term = allTerms.term().text();
            if (term.length() > 1 && !appVariables.isStopWord(term)) {
                TermDocs termDocs = r.termDocs(allTerms.term());
                float frequency[] = indexAccess.getTermFrequency(appVariables, termDocs);
                float cf = frequency[intervalNumber];
                if (cf > minTermOccur && cf < maxTermOccur) {
                    if (smooth > 0) {
                        frequency = dataManipulation.getSmoothedTermFrequency(frequency, smooth);
                    }
                    float tf = 0;
                    int peakIndex = 0;
                    for (int i = appVariables.startTimeSlice; i <= appVariables.endTimeSlice; i++) {
                        if (frequency[i] > tf) {
                            tf = frequency[i];
                            peakIndex = i;
                        }
                    }
                    float peakDay = (peakIndex * intervalDuration) / 24;
                    float peakDay1 = ((peakIndex + 1) * intervalDuration) / 24;
                    score.put(new DetectionResult(term,
                            formatter.format(peakDay) + ";" + formatter.format(peakDay1)), tf / cf);
                }
            }
        }
        indexAccess.close();
        score = Collection.getSortedMapDesc(score);
        Set<Entry<DetectionResult, Float>> entrySet = score.entrySet();
        results = FXCollections.observableArrayList();
        for (Entry<DetectionResult, Float> entry : entrySet) {
            results.add(0, entry.getKey());
        }
        long endNanoTime = System.nanoTime();
        long elapsedNanoTime = endNanoTime - startNanoTime;
        double elaspedSecondTime = (double) elapsedNanoTime / (double) 1000000000;
        appVariables.addLogEntry("[event detection] computed peaky topics, minTermSupport=" + minTermSupport
                + ", maxTermSupport=" + maxTermSupport + ". " + results.size() + " results in "
                + formatter.format(elaspedSecondTime) + "s");
        return results;
    } catch (IOException ex) {
        Logger.getLogger(PeakyTopics.class.getName()).log(Level.SEVERE, null, ex);
        return null;
    }
}