it.unipd.dei.ims.falcon.indexing.Indexing.java Source code

Introduction

Here is the source code for it.unipd.dei.ims.falcon.indexing.Indexing.java
Source

package it.unipd.dei.ims.falcon.indexing;

/**
 * Copyright 2010 University of Padova, Italy
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import it.unipd.dei.ims.falcon.analysis.chromafeatures.ChromaMatrixUtils;
import it.unipd.dei.ims.falcon.analysis.transposition.TranspositionEstimator;
import it.unipd.dei.ims.falcon.ranking.HashSimilarity;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PipedInputStream;
import java.io.PipedOutputStream;
import java.io.PrintWriter;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Scanner;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;

/**
 * Indexing class provides functionalities to index songs in a specified folder.
 * Each song needs to be represented as a text file, whose content is the
 * sequence of hashes extracted during the analysis phase. The entire hash sequence
 * for the song needs to be in a single text line. 
 * <p>
 * The basic rationale underlying the indexing step is that each song is mapped
 * in a set of possible overlapping subsequence of hashes of fixed length.
 * Each subsequence is named "segment". Both the number of hashes per segment,
 * namely the segment length, and the number of hashes in the overlap, namely
 * the overlap size, are specified as parameters for the method
 * {@link it.unipd.dei.ims.falcon.indexing.Indexing#index(java.lang.String, java.lang.String, int, int)}.
 * together with the full path to the folder containing the song collection and
 * the full path to the folder where the index will be stored.
 * <p>
 * Each obtained segment is mapped in a Lucene {@link org.apache.lucene.document.Document}
 * and written in the index. Each segment {@link org.apache.lucene.document.Document}
 * has three Lucene {@link org.apache.lucene.document.Field}'s:
 * <ul>
 *  <li> "CONTENT": sequence of hashes for the current segments; a white space
 *       is adopted as hash delimiter; the configuration currently used for this
 *       {@link org.apache.lucene.document.Field} is:
 *       <ul>
 *          <li>{@link org.apache.lucene.document.Field.Store#NO}
 *          <li>{@link org.apache.lucene.document.Field.Index#ANALYZED}
 *          <li>{@link org.apache.lucene.document.Field.TermVector#NO}
 *        </ul>
 *  <li> "TITLE": identifier of the song which the segment belongs to, e.g. "song2".
 *       The configuration currently used for this {@link org.apache.lucene.document.Field} is:
 *       <ul>
 *          <li>{@link org.apache.lucene.document.Field.Store#YES}
 *          <li>{@link org.apache.lucene.document.Field.Index#NOT_ANALYZED_NO_NORMS}
 *       </ul>
 *  <li> "ID": identifier of the segment; for instance the identifier "song2_4"
 *       denotes the fourth segment of the song "song2". The configuration
 *       currently used for this {@link org.apache.lucene.document.Field} is:
 *       <ul>
 *          <li>{@link org.apache.lucene.document.Field.Store#YES}
 *          <li>{@link org.apache.lucene.document.Field.Index#NOT_ANALYZED_NO_NORMS}
 *       </ul>
 * </ul>
 * If the total number of hashes in a segment is not a multiple of the specified
 * number of hashes per segment, the remaining part of the sequence is truncated.
 * <p>
 *
 * After indexing, collection wide statistics for each hash are computed and
 * stored in the file "qpruning_features.map" in the index folder.
 * Each line in this file corresponds to a distinct hash and contains four entries:
 * <ol>
 *  <li>the hash value;
 *  <li>the normalized document frequency, that is the number of document,
 *      namely segments, where the hash occurs, divided by the total number
 *      of segments in the index;
 *  <li>the normalized total collection frequency of the hash, that is the total
 *      number of occurrence of the hash in the entire collection divided by
 *      the sum of the total collection frequency of all the distinct hashes;
 *  <li>normalized maximum frequency for the current hash, that is the maximum
 *      value computed over all the segments in the index of the number of
 *      hash occurrence in a segment divided by the segment length.
 * </ol>
 * When the index is updated, the map is re-built.
 * <p>
 * 
 */
public class Indexing {

    /**
     * Indexes all the songs in the specified path.
     * The index is created in the specified directory "indexPath". If an index
     * already exists in that path, adds the songs to the existing index.
     * Each song is processed by the method
     * {@link it.unipd.dei.ims.falcon.indexing.Indexing#indexSong}
     * which maps the song into a set of segments, each of one is mapped in a
     * Lucene {@link org.apache.lucene.document.Document}.
     * The segments have fixed length, specifically are constituted by 
     * "hashPerSegment" hashes. There can be an overlap of "hashInOverlap"
     * hashes between two segments. The number of hash in the overlap must be
     * smaller than the number of hash per segments, otherwise an
     * {@link it.unipd.dei.ims.falcon.indexing.IndexingException} is thrown.
     * <p>
     * Once the index has been created or updated, writes a map into a file.
     * The map associates a set of features to each hash. Those features are
     * based on occurrence statistics of the hash in the entire collection.
     * In the event of an index update the map is re-built and the map file
     * is over-written.
     * @param data Input file. If it is a directory, index all files inside it.
     * @param index Falcon index.
     * @param hashPerSegment Number of hashes per segment.
     * @param hashInOverlap Number of overlapping hashes per segment.
     * @throws IndexingException 
     */
    public static void index(File data, File index, final int hashPerSegment, final int hashInOverlap,
            final int subsampling, final int nranks, final double minkurtosis,
            final TranspositionEstimator transpEst, boolean verbose) throws IndexingException, IOException {

        long start_time = System.currentTimeMillis();

        if (hashPerSegment <= hashInOverlap)
            throw new IndexingException(
                    "Number of hashes in the overlap cannot be equal to the number of hash per segment");

        if (!data.canRead())
            throw new IOException("cannot read input path");
        if (data.isDirectory()) {
            for (File f : data.listFiles())
                if (!f.canRead())
                    throw new IOException("cannot read one or more input files");
        }

        if (!index.exists()) // if index is being created rather than updated
            index.mkdir();
        if (!index.canWrite())
            throw new IOException("cannot write to index directory");

        SimpleFSDirectory indexDir = new SimpleFSDirectory(index, null);

        // initialize Lucene Analyzer and IndexWriter
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
        final IndexWriter writer = new IndexWriter(indexDir, analyzer, !IndexReader.indexExists(indexDir),
                IndexWriter.MaxFieldLength.UNLIMITED);
        writer.setSimilarity(new HashSimilarity());

        // transform chroma data into hashes and write into index
        File[] inputfiles = data.isDirectory() ? data.listFiles() : new File[] { data };
        int fileNo = 0;
        for (final File file : inputfiles) {
            // if the current considered files exists and is not hidden
            if (file.exists() && !file.getName().startsWith(".")) {
                if (verbose)
                    System.out.println(String.format("%10.3f%% - indexing %s", fileNo * 100. / inputfiles.length,
                            file.getAbsolutePath()));
                final List<OutputStream> fout = new LinkedList<OutputStream>();
                fout.add(new PipedOutputStream());
                final PipedInputStream fin = new PipedInputStream((PipedOutputStream) fout.get(0));
                Thread t = new Thread(new Runnable() {
                    public void run() {
                        try {
                            ChromaMatrixUtils.convertChromaStreamIntoHashesStream(new FileReader(file), fout,
                                    nranks, transpEst, minkurtosis, subsampling);
                        } catch (IOException ex) {
                            // TODO do something better for this exception ... (might hang all ...)
                            Logger.getLogger(Indexing.class.getName()).log(Level.SEVERE, null, ex);
                        }
                    }
                });
                t.start();
                indexSong(writer, fin, hashPerSegment, hashInOverlap, file.getAbsolutePath(),
                        file.getAbsolutePath());
                fileNo++;
            }
        }
        writer.optimize();
        writer.close();

        // additional falcon features
        PrintWriter pw = new PrintWriter(index.getAbsolutePath() + "/qpruning_features.map");
        IndexReader reader = IndexReader.open(new SimpleFSDirectory(index));
        int numSegments = reader.numDocs();
        long total_hcf = numSegments * hashPerSegment; // total number of hashes in the collection
        TermEnum hashes = reader.terms(); // distinct hashes in the collection

        while (hashes.next()) {
            if (!hashes.term().field().equals("CONTENT")) {
                continue;
            }
            Term curHash = hashes.term();
            pw.print(curHash.text() + "\t");
            pw.print((double) reader.docFreq(curHash) / numSegments + "\t"); // normalized document frequency
            TermDocs curHash_pl = reader.termDocs(curHash); // posting list for the current hash
            // computation of the frequency of the current hash in the
            // entire collection -- value initialization
            long hcf = 0;
            // initializes the normalized maximum frequency value
            double nmf = 0;
            // initializes the normalized frequency for max computation
            double cur_nf = 0;
            // processes posting list entries
            while (curHash_pl.next()) {
                // computation of the normalized frequency for
                // the current hash
                cur_nf = (double) curHash_pl.freq() / hashPerSegment;
                // update max if necessary
                if (cur_nf > nmf) {
                    nmf = cur_nf;
                }
                hcf += curHash_pl.freq();
            }
            // prints normalized total collection frequency and
            // normalized maximum frequency for the current hash
            pw.print((double) hcf / total_hcf + "\t" + nmf + "\n");
        }
        pw.flush();
        pw.close();

        long end_time = System.currentTimeMillis();
        if (verbose)
            System.out.println(String.format("[INDEXING] - elapsed time: %10.3f", (end_time - start_time) / 1000.));

    }

    /**
     * Maps the song in the input file in a set of {@link org.apache.lucene.document.Document}'s and index them.
     * Each song is divided in a set of possibly overlapping segments of fixed
     * length; in particular "hashPerSegment" is the number of hash in each
     * segment. The number of hashes in the overlap is "hashInOverlap".
     * If the number of hashes in the last segment is less the "hashPerSegment",
     * this sequence of hashes is discarded.
     * Each segment is mapped in a {@link org.apache.lucene.document.Document}
     * with three {@link org.apache.lucene.document.Field}'s:
     * <ul>
     *  <li> "CONTENT": sequence of hashes in the current segment
     *  <li> "TITLE": title of the file the current segment belongs to
     *  <li> "ID": identifier of the segments, that is the form "TITLE"_i,
     *       where "i" is the consecutive segment number in the song, e.g.
     *       "song2_3" denotes the third segment of the song "song2
     * </ul>
     * 
     * @param writer
     *          {@link org.apache.lucene.index.IndexWriter} for the current index
     * @param strBuilder
     *          {@link java.lang.StringBuilder} where each segment is stored during indexing
     * @param file
     *          file where the text representation of the song is stored
     * @param hashPerSegment
     *          number of hashes in each segment
     * @param hashInOverlap
     *          number of hashes in the overlap among segments
     * @throws FileNotFoundException
     * @throws IOException
     */
    private static void indexSong(IndexWriter writer, InputStream is, int hashPerSegment, int hashInOverlap,
            String title, String id) throws FileNotFoundException, IOException {

        BufferedReader buffReader = new BufferedReader(new InputStreamReader(is));
        String content = buffReader.readLine();
        Scanner scanner = new Scanner(content);

        // number of segments in the current document
        int hashSegment = 1;
        // number of hash processed in the current segment
        int curHashInSegment = 0;

        LinkedList<Integer> hashCache = new LinkedList<Integer>();

        StringBuilder strBuilder = new StringBuilder();

        while (scanner.hasNext()) {

            int curHash = Integer.parseInt(scanner.next());
            if (curHash == -1) {
                continue;
            }

            strBuilder.append(curHash).append(" ");
            hashCache.add(curHash);

            curHashInSegment++;

            // number of hash processed and in the buffer equals the number
            // of hash per segment
            if (hashPerSegment - curHashInSegment == 0) {

                // create a Lucene Document for the current segment
                Document doc = new Document();
                // add the field for the content of the document
                // this field will be analyzed and indexed, but not store
                // in its non-parsed form
                doc.add(new Field("CONTENT", strBuilder.toString(), Field.Store.NO, Field.Index.ANALYZED,
                        Field.TermVector.NO));
                // add the field for the identifier of the currently processed song
                doc.add(new Field("TITLE", title, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));

                // add the field for the identifier of the current segment
                doc.add(new Field("ID", id + "_" + hashSegment, Field.Store.YES,
                        Field.Index.NOT_ANALYZED_NO_NORMS));

                // add a filed to store the length of this segment
                doc.add(new Field("LENGTH", Integer.toString(hashPerSegment), Field.Store.YES,
                        Field.Index.NOT_ANALYZED_NO_NORMS));
                writer.addDocument(doc);

                if (strBuilder.length() > 0) {
                    // creates a new string builder for the next segment
                    strBuilder = new StringBuilder();

                    if (hashInOverlap == 0) { // if the overlap size is zero
                        // the number of segment already processed for the next
                        // segment is zero
                        curHashInSegment = 0;
                    } else {
                        // if overlap is required, removes from the cache
                        // the segments not in the overlap
                        for (int h = 0; h < hashPerSegment - hashInOverlap; h++) {
                            hashCache.poll();
                        }
                        // fills the buffer with the hashes in the overlap
                        int h = 0;
                        Iterator<Integer> iter = hashCache.iterator();
                        while (iter.hasNext() && h < hashInOverlap) {
                            strBuilder.append(iter.next()).append(" ");
                            h++;
                        }
                        // update the number of hashes currently processed for
                        // the next segment to the number of hashes in the overlap
                        // since they are in common for the two segments
                        curHashInSegment = hashInOverlap;
                    }
                }
                // increases the number of segments for the current song
                hashSegment++;
            }
        }
    }

    /**
     * Prints information on the songs stored in the index in the specified path.
     * The specific information printed is that specified by the "option".
     * Available options are:
     * <ul>
     *  <li> "show_doc_ids": prints the internal index identifier of all the
     *       segments in the index together with the title of the song which
     *       the segment belongs to;
     *  <li> "show_seg_ids": prints the internal index identifier of all the
     *       segments in the index together with the segment identifier;
     *  <li> "show_full_index": print all the distinct hashes in the index
     *       and the posting list associated to each hash     *
     * </ul>
     *  
     * @param indexPath
     *                  full path to the folder where the index is stored
     * @param option
     *                  option which specified the requested information
     *                  
     * @throws IndexingException
     */
    public static void indexUtils(String indexPath, String option) throws IndexingException {
        IndexReader reader;
        try {
            reader = IndexReader.open(new SimpleFSDirectory(new File(indexPath), null));

            if (option.equals("show_doc_ids")) {
                //  prints all the internal segment identifiers together with
                //  the title of the song of the considered segment.
                //  For instance, "[6] song2" denotes that the segment with
                //  internal identifier "6" belongs to the song with title "song2"
                for (int d = 0; d < reader.numDocs(); d++) {
                    System.out.println("[" + d + "] " + reader.document(d).getField("TITLE").stringValue());
                }
            } else if (option.equals("show_seg_ids")) {
                //  prints all the internal segment identifiers together with
                //  the identifier of the segment.
                //  For instance, "[8] song2_3" denotes that the third segment
                //  of "song2" has internal identifier "8
                for (int d = 0; d < reader.numDocs(); d++) {
                    System.out.println("[" + d + "] " + reader.document(d).getField("ID").stringValue());
                }

            } else if (option.equals("show_full_index")) {
                // print the full index, that is each hash with the associated
                // posting list
                TermEnum terms = reader.terms();
                while (terms.next()) {
                    System.out.print(terms.term() + " [SF: " + terms.docFreq() + "] <");

                    TermPositions poss = reader.termPositions(terms.term());
                    while (poss.next()) {
                        System.out.print(" " + reader.document(poss.doc()).getField("ID").stringValue() + " ("
                                + poss.freq() + "), ");
                    }
                    System.out.println(">");
                }
            }
        } catch (CorruptIndexException ex) {
            throw new IndexingException("CorruptIndexException when accessing index for printing information");
        } catch (IOException ex) {
            throw new IndexingException("IOException when accessing index for printing information");
        }
    }
}