Java tutorial
/*
 * KarshaAnnotate - Annotation tool for financial documents
 *
 * Copyright (C) 2013, Lanka Software Foundation and University of Maryland.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package org.karsha.base;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;

import java.io.*;
import java.util.*;
import java.util.Map.Entry;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.*;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

import org.karsha.entities.DocSection;
import org.karsha.tokenize.DefaultTokenizer;

/*
 * import org.pdfbox.cos.COSDocument;
 * import org.pdfbox.io.RandomAccessBuffer;
 * import org.pdfbox.pdfparser.PDFParser;
 * import org.pdfbox.pdmodel.PDDocument;
 * import org.pdfbox.util.PDFTextStripper;
 */

/*
 * import org.apache.pdfbox.cos.COSDocument;
 * import org.apache.pdfbox.io.RandomAccessBuffer;
 * import org.apache.pdfbox.pdfparser.PDFParser;
 * import org.apache.pdfbox.pdmodel.PDDocument;
 * import org.apache.pdfbox.util.PDFTextStripper;
 */
/**
 * Copyright (C) 2012, Lanka Software Foundation.
 *
 * Date            Author          Changes
 * April 14, 2012  Kasun Perera    Created
 */

/**
 * Contains methods for indexing documents with Lucene and for calculating
 * TF-IDF weights.
 */
public class DocIndexer {

    private String docNames[];
    private String pathToIndex;
    private String fiboTermList[];   // marked-up FIBO terms
    private String taxoTermList[];   // marked-up taxonomy terms
    private RAMDirectory ramMemDir;
    private String fileNames[];
    private byte files[][];
    private String filesInText[];
    int noOfWordsOfDOc[];
    int noOfSentencesOfDoc[];
    ArrayList<String> ArrLstSentencesOfDoc[];
    String removedTermsOfDOc[][];
    int freqAfterRemovalOfDoc[][];
    //int queryDocIndex;
    private int curDocNo;
    private final int maxTerms = 1000000;

    /**
     * Constructor used when the indexing directory is the local directory.
     *
     * @param fileNames the physical locations of the documents
     * @param docNames  the corresponding document names
     */
    /*
     * public DocIndexer(String fileNames[], String docNames[]) {
     *     this.docNames = docNames;
     *     this.pathToIndex = "C:\\Users\\lsf\\Desktop\\index";
     *     this.fileNames = fileNames;
     *     //this.ramMemDir = new RAMDirectory();
     *     //pathToIndex = new RAMDirectory().toString(); //this.bufPathToIndex.toString();
     *     //this.queryDocIndex = queryDocIndex;
     *     int len = fileNames.length;
     *     this.noOfWordsOfDOc = new int[len];
     *     this.ArrLstSentencesOfDoc = new ArrayList[len];
     *     this.noOfSentencesOfDoc = new int[len];
     *     this.removedTermsOfDOc = new String[len][];
     *     this.freqAfterRemovalOfDoc = new int[len][];
     *     this.curDocNo = 0;
     *     //this.termsOfFIBO = fiboTerms;
     * }
     */

    /**
     * Constructor used when the indexing directory is the local directory and
     * FIBO and taxonomy terms are taken into the indexing.
     *
     * @param fileNames    the physical locations of the documents
     * @param docNames     the corresponding document names
     * @param fiboTermList the corresponding FIBO term list
     * @param taxoTermList the corresponding taxonomy term list
     */
    public DocIndexer(String fileNames[], String docNames[], String fiboTermList[], String taxoTermList[]) {
        this.docNames = docNames;
        //this.pathToIndex = "C:\\Users\\lsf\\Desktop\\index";
        this.fileNames = fileNames;
        this.fiboTermList = fiboTermList;
        this.taxoTermList = taxoTermList;
        this.ramMemDir = new RAMDirectory();
        //pathToIndex = new RAMDirectory().toString(); //this.bufPathToIndex.toString();
        //this.queryDocIndex = queryDocIndex;
        int len = fileNames.length;
        this.noOfWordsOfDOc = new int[len];
        this.ArrLstSentencesOfDoc = new ArrayList[len];
        this.noOfSentencesOfDoc = new int[len];
        this.removedTermsOfDOc = new String[len][];
        this.freqAfterRemovalOfDoc = new int[len][];
        this.curDocNo = 0;
        //this.termsOfFIBO = fiboTerms;
    }
    /**
     * Constructor used when the indexing directory is a RAM directory. A RAM
     * directory is needed because the Stratos server does not allow access to
     * local files.
     *
     * @param files    the documents, each converted into a byte array
     * @param docNames the corresponding document names
     */
    public DocIndexer(byte files[][], String docNames[]) {
        this.docNames = docNames;
        //this.bufPathToIndex = new RandomAccessBuffer();
        this.ramMemDir = new RAMDirectory();
        //pathToIndex = new RAMDirectory().toString(); //this.bufPathToIndex.toString();
        this.files = files;
        //this.queryDocIndex = queryDocIndex;
        int len = files.length;
        this.noOfWordsOfDOc = new int[len];
        this.ArrLstSentencesOfDoc = new ArrayList[len];
        this.noOfSentencesOfDoc = new int[len];
        this.removedTermsOfDOc = new String[len][];
        this.freqAfterRemovalOfDoc = new int[len][];
        this.curDocNo = 0;
        //this.termsOfFIBO = fiboTerms;
    }

    /**
     * Constructor used when the indexing directory is a RAM directory. A RAM
     * directory is needed because the WSO2 Stratos server used for hosting
     * does not allow access to local files.
     *
     * @param docContent the text content of each document
     * @param docNames   the corresponding document names
     */
    public DocIndexer(String docContent[], String docNames[]) {
        this.docNames = docNames;
        //this.bufPathToIndex = new RandomAccessBuffer();
        this.ramMemDir = new RAMDirectory();
        //pathToIndex = new RAMDirectory().toString(); //this.bufPathToIndex.toString();
        //this.files = files;
        this.filesInText = docContent;
        //this.queryDocIndex = queryDocIndex;
        int len = filesInText.length;
        this.noOfWordsOfDOc = new int[len];
        this.ArrLstSentencesOfDoc = new ArrayList[len];
        this.noOfSentencesOfDoc = new int[len];
        this.removedTermsOfDOc = new String[len][];
        this.freqAfterRemovalOfDoc = new int[len][];
        this.curDocNo = 0;
        //this.termsOfFIBO = fiboTerms;
    }

    /**
     * Constructor used when the indexing directory is a RAM directory. A RAM
     * directory is needed because the Stratos server does not allow access to
     * local files. This constructor also uses FIBO and taxonomy terms for the
     * indexing.
     *
     * @param files        the documents, each converted into a byte array
     * @param docNames     the corresponding document names
     * @param fiboTermList the corresponding FIBO term list
     * @param taxoTermList the corresponding taxonomy term list
     */
    public DocIndexer(byte files[][], String docNames[], String fiboTermList[], String taxoTermList[]) {
        this.docNames = docNames;
        this.fiboTermList = fiboTermList;
        this.taxoTermList = taxoTermList;
        //this.bufPathToIndex = new RandomAccessBuffer();
        this.ramMemDir = new RAMDirectory();
        //pathToIndex = new RAMDirectory().toString(); //this.bufPathToIndex.toString();
        this.files = files;
        //this.queryDocIndex = queryDocIndex;
        int len = files.length;
        this.noOfWordsOfDOc = new int[len];
        this.ArrLstSentencesOfDoc = new ArrayList[len];
        this.noOfSentencesOfDoc = new int[len];
        this.removedTermsOfDOc = new String[len][];
        this.freqAfterRemovalOfDoc = new int[len][];
        this.curDocNo = 0;
        //this.termsOfFIBO = fiboTerms;
    }
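    /*
     * Added sketch, not part of the original class: the byte[][] constructors
     * above expect each document pre-loaded into a byte array. One plain-JDK
     * way to do that (the method name is illustrative only) would be:
     */
    private static byte[] readFileToBytesSketch(File f) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        FileInputStream in = new FileInputStream(f);
        try {
            byte[] buf = new byte[8192];
            int n;
            // Copy the file into the in-memory buffer in 8 KB chunks.
            while ((n = in.read(buf)) != -1) {
                out.write(buf, 0, n);
            }
        } finally {
            in.close();
        }
        return out.toByteArray();
    }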
    /**
     * Converts a PDF document into text, because Lucene cannot process PDF
     * documents directly. This method uses the Apache PDFBox library and a RAM
     * buffer for parsing. Documents have to be initialized through a
     * constructor before using this method.
     *
     * @param docNo the document ID (taken from the order in which the
     *              documents were initialized)
     * @return the converted text of the corresponding PDF file
     * @throws IOException
     */
    /*
     * public String convertPDFToText(int docNo) throws IOException {
     *     PDFParser parser;
     *     String parsedText;
     *     PDFTextStripper pdfStripper;
     *     PDDocument pdDoc = null;
     *     COSDocument cosDoc = null;
     *     try {
     *         //parser.parse();
     *         //cosDoc = parser.getDocument();
     *         pdfStripper = new PDFTextStripper();
     *         RandomAccessBuffer tempMemBuffer = new RandomAccessBuffer();
     *         //curDocNo++;
     *         pdDoc = PDDocument.load(new ByteArrayInputStream(files[docNo]), tempMemBuffer);
     *         //pdDoc = new PDDocument(cosDoc);
     *         parsedText = pdfStripper.getText(pdDoc);
     *     } catch (Exception e) {
     *         //System.out.println("An exception occurred in parsing the PDF document.");
     *         e.printStackTrace();
     *         return null;
     *     } finally {
     *         if (cosDoc != null) {
     *             cosDoc.close();
     *         }
     *         if (pdDoc != null) {
     *             pdDoc.close();
     *         }
     *     }
     *     if (parsedText == null) {
     *         parsedText = "PDF to Text Conversion failed";
     *     }
     *     return parsedText;
     * }
     */

    /**
     * Reads a text file; used when running the program locally.
     *
     * @param fileName the full file path of the document
     * @return the text of the corresponding document
     */
    public String ReadTextFile(String fileName) {
        // Accumulate the lines in a StringBuilder (seeded empty so the result
        // has no "null" prefix), separated by spaces so that words do not
        // merge across line breaks.
        StringBuilder fileAsString = new StringBuilder();
        try {
            String strLine;
            // Open the file given by the path parameter.
            FileInputStream fstream = new FileInputStream(fileName);
            DataInputStream in = new DataInputStream(fstream);
            BufferedReader br = new BufferedReader(new InputStreamReader(in));
            // Read the file line by line.
            while ((strLine = br.readLine()) != null) {
                fileAsString.append(strLine).append(' ');
            }
            // Close the input stream.
            in.close();
        } catch (Exception e) { // Catch exception if any
            System.err.println("Error: " + e.getMessage());
        }
        return fileAsString.toString();
    }

    /**
     * Counts the number of words in a given string.
     *
     * @param line the input string
     * @return the number of words in the input string
     */
    private int wordCount(String line) {
        int numWords = 0;
        int index = 0;
        boolean prevWhiteSpace = true;
        while (index < line.length()) {
            char c = line.charAt(index++);
            boolean currWhiteSpace = Character.isWhitespace(c);
            if (prevWhiteSpace && !currWhiteSpace) {
                numWords++;
            }
            prevWhiteSpace = currWhiteSpace;
        }
        return numWords;
    }
    /**
     * Indexes a document using only its content. A "docid" field is added
     * because Lucene does not retrieve documents in the indexed order. A RAM
     * directory is used for indexing.
     *
     * @param docNo the number of the document to be indexed
     * @throws IOException
     */
    public void index(int docNo) throws IOException {
        //String content = convertPDFToText(docNo);
        //String content = ReadTextFile(fileNames[docNo]);
        //this.noOfWordsOfDOc[curDocNo] = wordCount(content);
        //StringReader strRdElt = new StringReader(content);
        StringReader strRdElt = new StringReader(new DefaultTokenizer().processText(filesInText[docNo]));
        StringReader docId = new StringReader(Integer.toString(docNo));

        Document doc = new Document();
        doc.add(new Field("doccontent", strRdElt, Field.TermVector.YES));
        doc.add(new Field("docid", docId, Field.TermVector.YES));
        //this.ArrLstSentencesOfDoc[curDocNo] = sentenceCount(content);
        //this.noOfSentencesOfDoc[curDocNo] = this.ArrLstSentencesOfDoc[curDocNo].size();

        IndexWriter iW;
        try {
            //NIOFSDirectory dir = new NIOFSDirectory(new File(pathToIndex));
            //iW = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_35,
            //        new StandardAnalyzer(Version.LUCENE_35)));
            // The default open mode (CREATE_OR_APPEND) lets each call append
            // one more document to the shared RAM directory.
            iW = new IndexWriter(ramMemDir, new IndexWriterConfig(Version.LUCENE_35,
                    new StandardAnalyzer(Version.LUCENE_35)));
            iW.addDocument(doc);
            iW.close();
            //dir.close();
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Indexes a document using its content, with the "docid" field added
     * because Lucene does not retrieve documents in the indexed order. A RAM
     * directory is used for indexing, and the taxonomy and FIBO terms are
     * indexed as separate fields.
     *
     * @param docNo the number of the document to be indexed
     * @throws IOException
     * @throws ClassNotFoundException
     */
    /*
     * public void indexWithTaxoFibo(int docNo) throws IOException, ClassNotFoundException {
     *     String content = convertPDFToText(docNo);
     *     //String content = ReadTextFile(fileNames[docNo]);
     *     this.noOfWordsOfDOc[docNo] = wordCount(content);
     *     StringReader strRdElt = new StringReader(new DefaultTokenizer().processText(content));
     *     StringReader docId = new StringReader(Integer.toString(docNo));
     *     //String fibo_taxo = "putable";
     *
     *     Document doc = new Document();
     *     doc.add(new Field("doccontent", strRdElt, Field.TermVector.YES));
     *     doc.add(new Field("docid", docId, Field.TermVector.YES));
     *     doc.add(new Field("fiboterms", fiboTermList[docNo], Field.Store.YES,
     *             Field.Index.ANALYZED, Field.TermVector.YES));
     *     doc.add(new Field("taxoterms", taxoTermList[docNo], Field.Store.YES,
     *             Field.Index.ANALYZED, Field.TermVector.YES));
     *
     *     IndexWriter iW;
     *     try {
     *         iW = new IndexWriter(ramMemDir, new IndexWriterConfig(Version.LUCENE_35,
     *                 new StandardAnalyzer(Version.LUCENE_35)));
     *         iW.addDocument(doc);
     *         iW.close();
     *     } catch (CorruptIndexException e) {
     *         e.printStackTrace();
     *     } catch (IOException e) {
     *         e.printStackTrace();
     *     }
     * }
     */

    /**
     * Indexes a document using its content with lemmatization of the terms,
     * with the "docid" field added because Lucene does not retrieve documents
     * in the indexed order. A RAM directory is used for indexing.
     *
     * @param docNo
     * @param tagger
     * @throws IOException
     * @throws ClassNotFoundException
     */
    /*
     * public void indexWithSPPOSTagger(int docNo, MaxentTagger tagger)
     *         throws IOException, ClassNotFoundException {
     *     String content = convertPDFToText(docNo);
     *     //String content = ReadTextFile(fileNames[docNo]);
     *     String lemmatizedText = analyze(new DefaultTokenizer().processText(content), tagger);
     *     //this.noOfWordsOfDOc[docNo] = wordCount(content);
     *     this.noOfWordsOfDOc[docNo] = wordCount(lemmatizedText);
     *     StringReader strRdElt = new StringReader(lemmatizedText);
     *     StringReader docId = new StringReader(Integer.toString(docNo));
     *
     *     Document doc = new Document();
     *     doc.add(new Field("doccontent", strRdElt, Field.TermVector.YES));
     *     doc.add(new Field("docid", docId, Field.TermVector.YES));
     *
     *     // Initialize the tagger:
     *     //MaxentTagger tagger = new MaxentTagger("tagger/bidirectional-distsim-wsj-0-18.tagger");
     *     //EnglishLemmaAnalyzer englemaanalyzer = new EnglishLemmaAnalyzer(tagger);
     *
     *     IndexWriter iW;
     *     try {
     *         iW = new IndexWriter(ramMemDir, new IndexWriterConfig(Version.LUCENE_35,
     *                 new StandardAnalyzer(Version.LUCENE_35)));
     *         //iW = new IndexWriter(ramMemDir, new IndexWriterConfig(Version.LUCENE_35, englemaanalyzer));
     *         iW.addDocument(doc);
     *         iW.close();
     *     } catch (CorruptIndexException e) {
     *         e.printStackTrace();
     *     } catch (IOException e) {
     *         e.printStackTrace();
     *     }
     * }
     */
    /**
     * Lemmatizes the given text using EnglishLemmaAnalyzer.
     *
     * @param text   the text to lemmatize
     * @param tagger a Stanford parser {@code MaxentTagger} instance
     * @return the lemmatized text
     * @throws IOException
     * @throws ClassNotFoundException
     */
    public String analyze(String text, MaxentTagger tagger) throws IOException, ClassNotFoundException {
        //MaxentTagger tagger = new MaxentTagger("tagger/bidirectional-distsim-wsj-0-18.tagger");
        Analyzer analyzer = new EnglishLemmaAnalyzer(tagger);
        TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
        TermAttribute termAttribute = stream.getAttribute(TermAttribute.class);
        // Collect every token exactly once; calling incrementToken() a second
        // time inside the loop would silently skip alternate terms.
        StringBuilder lemmatized = new StringBuilder();
        while (stream.incrementToken()) {
            if (lemmatized.length() > 0) {
                lemmatized.append(' ');
            }
            lemmatized.append(termAttribute.term());
        }
        stream.close();
        return lemmatized.toString();
    }

    /**
     * Calculates the TF-IDF score for each term in the indexed documents.
     *
     * @param numberOfDocs the number of indexed documents
     * @return a map from document ID to a map of TF-IDF score per term
     * @throws CorruptIndexException
     * @throws ParseException
     */
    public HashMap<Integer, HashMap> tfIdfScore(int numberOfDocs) throws CorruptIndexException, ParseException {
        int noOfDocs = docNames.length;
        HashMap<Integer, HashMap> scoreMap = new HashMap<Integer, HashMap>();
        try {
            //IndexReader re = IndexReader.open(FSDirectory.open(new File(pathToIndex)), true);
            IndexReader re = IndexReader.open(ramMemDir);
            int i = 0;
            for (int k = 0; k < numberOfDocs; k++) {
                int freq[];
                TermFreqVector termsFreq;
                TermFreqVector termsFreqDocId;
                HashMap<String, Float> wordMap = new HashMap<String, Float>();
                String terms[];
                float score[] = null;

                termsFreq = re.getTermFreqVector(k, "doccontent");
                termsFreqDocId = re.getTermFreqVector(k, "docid");
                // The single term of the "docid" field is the original document number.
                int aInt = Integer.parseInt(termsFreqDocId.getTerms()[0]);

                freq = termsFreq.getTermFrequencies();
                terms = termsFreq.getTerms();
                int noOfTerms = terms.length;
                score = new float[noOfTerms];
                DefaultSimilarity simi = new DefaultSimilarity();
                for (i = 0; i < noOfTerms; i++) {
                    int noofDocsContainTerm = re.docFreq(new Term("doccontent", terms[i]));
                    float tf = simi.tf(freq[i]);
                    float idf = simi.idf(noofDocsContainTerm, noOfDocs);
                    wordMap.put(terms[i], (tf * idf));
                }
                scoreMap.put(aInt, wordMap);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return scoreMap;
    }
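    /*
     * Added sketch, not part of the original class: recomputes the per-term
     * weight stored by tfIdfScore(...) above for a single term. With Lucene
     * 3.x DefaultSimilarity, tf(freq) is sqrt(freq) and idf(docFreq, numDocs)
     * is log(numDocs / (docFreq + 1)) + 1, so a term occurring 4 times in one
     * of 10 documents, 3 of which contain it, scores about 2 * 1.92 = 3.83.
     */
    private static float tfIdfWeightSketch(int termFreqInDoc, int docsContainingTerm, int totalDocs) {
        DefaultSimilarity simi = new DefaultSimilarity();
        // Same combination of factors that tfIdfScore(...) puts into its word map.
        return simi.tf(termFreqInDoc) * simi.idf(docsContainingTerm, totalDocs);
    }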
    /**
     * Calculates the TF-IDF score for each term, including marked-up taxonomy
     * and FIBO terms, in the indexed documents.
     *
     * @param numberOfDocs the number of indexed documents
     * @param weight       a higher weight can be given to boost FIBO and
     *                     taxonomy terms (a weight of 1 leaves them unboosted)
     * @return a map from document ID to a map of TF-IDF score per term
     * @throws CorruptIndexException
     * @throws ParseException
     */
    public HashMap<Integer, HashMap> tfIdfScoreWithMarkUpTerms(int numberOfDocs, int weight)
            throws CorruptIndexException, ParseException {
        int noOfDocs = docNames.length;
        HashMap<Integer, HashMap> scoreMap = new HashMap<Integer, HashMap>();
        try {
            //IndexReader re = IndexReader.open(FSDirectory.open(new File(pathToIndex)), true);
            IndexReader re = IndexReader.open(ramMemDir);
            int i = 0;
            for (int k = 0; k < numberOfDocs; k++) {
                int aInt = 0;
                TermFreqVector termsFreq;
                TermFreqVector termsFreqDocId = null;
                TermFreqVector termsFreqFiboTerm;
                TermFreqVector termsFreqTaxoTerm;
                HashMap<String, Float> wordMap = new HashMap<String, Float>();
                String termsVec[][];
                int freqVec[];
                int noOfTermsVec[];
                String terms[];
                int freq[];
                int noOfTerms;
                float score[] = null;

                // Process the indexed fields in order: doccontent, docid,
                // fiboterms, taxoterms.
                DefaultSimilarity simi = new DefaultSimilarity();
                for (int m = 0; m < 4; m++) {
                    switch (m) {
                        case 0: // doc content
                            termsFreq = re.getTermFreqVector(k, "doccontent");
                            freq = termsFreq.getTermFrequencies();
                            terms = termsFreq.getTerms();
                            noOfTerms = terms.length;
                            score = new float[noOfTerms];
                            for (i = 0; i < noOfTerms; i++) {
                                int noofDocsContainTerm = re.docFreq(new Term("doccontent", terms[i]));
                                float tf = simi.tf(freq[i]);
                                float idf = simi.idf(noofDocsContainTerm, noOfDocs);
                                wordMap.put(terms[i], (tf * idf));
                            }
                            break;
                        case 1: // doc ID
                            termsFreqDocId = re.getTermFreqVector(k, "docid");
                            aInt = Integer.parseInt(termsFreqDocId.getTerms()[0]);
                            break;
                        case 2: // FIBO terms
                            termsFreqFiboTerm = re.getTermFreqVector(k, "fiboterms");
                            if (termsFreqFiboTerm != null) {
                                freq = termsFreqFiboTerm.getTermFrequencies();
                                terms = termsFreqFiboTerm.getTerms();
                                noOfTerms = terms.length;
                                score = new float[noOfTerms];
                                for (i = 0; i < noOfTerms; i++) {
                                    int noofDocsContainTerm = re.docFreq(new Term("fiboterms", terms[i]));
                                    float tf = simi.tf(freq[i]);
                                    float idf = simi.idf(noofDocsContainTerm, noOfDocs);
                                    wordMap.put(terms[i], (tf * idf * weight));
                                }
                            }
                            break;
                        case 3: // taxonomy terms
                            termsFreqTaxoTerm = re.getTermFreqVector(k, "taxoterms");
                            if (termsFreqTaxoTerm != null) {
                                freq = termsFreqTaxoTerm.getTermFrequencies();
                                terms = termsFreqTaxoTerm.getTerms();
                                noOfTerms = terms.length;
                                score = new float[noOfTerms];
                                for (i = 0; i < noOfTerms; i++) {
                                    int noofDocsContainTerm = re.docFreq(new Term("taxoterms", terms[i]));
                                    float tf = simi.tf(freq[i]);
                                    float idf = simi.idf(noofDocsContainTerm, noOfDocs);
                                    wordMap.put(terms[i], (tf * idf * weight));
                                }
                            }
                            break;
                        default:
                            //System.out.println("Invalid Entry!");
                            break;
                    }
                }
                scoreMap.put(aInt, wordMap);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return scoreMap;
    }
    /**
     * Indexes the documents (plain indexing, or indexing with FIBO and
     * taxonomy terms, etc.) and then calculates the similarity of every
     * document to the query document using one of several measures (cosine,
     * Jaccard, Okapi).
     *
     * @param queryIndex the index of the query document
     * @return an array of similarity scores, one per document
     * @throws IOException
     * @throws CorruptIndexException
     * @throws ParseException
     * @throws ClassNotFoundException
     */
    public double[] consineSimilarityTo(int queryIndex)
            throws IOException, CorruptIndexException, ParseException, ClassNotFoundException {
        int noOfDocs = docNames.length;
        float tfIdfScore[][] = new float[noOfDocs][];
        HashMap<Integer, HashMap> scoreMap = new HashMap<Integer, HashMap>();

        /*
         * Do all the indexing at once. To index with lemmatization, create the
         * MaxentTagger here; the path to the tagger file (downloadable from
         * the Stanford parser website) must be supplied.
         */
        //MaxentTagger tagger = new MaxentTagger("tagger/left3words-wsj-0-18.tagger");
        //MaxentTagger tagger = new MaxentTagger("tagger/bidirectional-distsim-wsj-0-18.tagger");
        for (int i = 0; i < noOfDocs; i++) {
            index(i);
            //indexWithTaxoFibo(i);
            //indexWithSPPOSTagger(i, tagger);
        }

        //scoreMap = tfIdfScore(noOfDocs);
        int weight = 1;
        scoreMap = tfIdfScoreWithMarkUpTerms(noOfDocs, weight);

        // Calculate the cosine similarity of every document to the query document.
        double sim[] = new double[noOfDocs];
        for (int i = 0; i < noOfDocs; i++) {
            sim[i] = Similarity.getCosineSimilarity(scoreMap.get(queryIndex), scoreMap.get(i));
            //simi.add(Similarity.getJaccardSimilarity(scoreMap.get(queryIndex), scoreMap.get(i)));
        }
        /*
         * Pairwise variant:
         * for (int i = 0; i < noOfDocs; i++) {
         *     for (int j = i; j < noOfDocs; j++) {
         *         simi.add(Similarity.getJaccardSimilarity(scoreMap.get(i), scoreMap.get(j)));
         *     }
         * }
         */
        return sim;
    }
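    /*
     * Added sketch, not part of the original class: Similarity.getCosineSimilarity
     * above presumably refers to a project helper class (Lucene's own
     * org.apache.lucene.search.Similarity has no such method) whose source is
     * not shown in this listing. A standard cosine similarity over two
     * term -> TF-IDF maps, as built by tfIdfScoreWithMarkUpTerms(...), would
     * look roughly like this; the real helper's signature may differ.
     */
    private static double cosineSimilaritySketch(Map<String, Float> a, Map<String, Float> b) {
        double dot = 0.0, normA = 0.0, normB = 0.0;
        for (Map.Entry<String, Float> e : a.entrySet()) {
            Float other = b.get(e.getKey());
            if (other != null) {
                dot += e.getValue() * other;   // only shared terms contribute to the dot product
            }
            normA += e.getValue() * e.getValue();
        }
        for (Float v : b.values()) {
            normB += v * v;
        }
        return (normA == 0.0 || normB == 0.0) ? 0.0 : dot / (Math.sqrt(normA) * Math.sqrt(normB));
    }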
    //--------------------------- Okapi testing ---------------------------

    /**
     * Builds the raw term frequencies for each indexed document, keyed by
     * document ID; used as input for the Okapi similarity calculation.
     */
    public HashMap<Integer, HashMap> getTfForDocs(int numberOfDocs, int weight)
            throws CorruptIndexException, ParseException {
        int noOfDocs = numberOfDocs;
        HashMap<Integer, HashMap> tfMap = new HashMap<Integer, HashMap>();
        try {
            //IndexReader re = IndexReader.open(FSDirectory.open(new File(pathToIndex)), true);
            IndexReader re = IndexReader.open(ramMemDir);
            int i = 0;
            for (int k = 0; k < numberOfDocs; k++) {
                int aInt = 0;
                TermFreqVector termsFreq;
                TermFreqVector termsFreqDocId = null;
                TermFreqVector termsFreqFiboTerm;
                TermFreqVector termsFreqTaxoTerm;
                HashMap<String, Integer> wordMap = new HashMap<String, Integer>();
                String termsVec[][];
                int freqVec[];
                int noOfTermsVec[];
                String terms[];
                int freq[];
                int noOfTerms;
                float score[] = null;

                // Process the indexed fields in order: doccontent, docid.
                DefaultSimilarity simi = new DefaultSimilarity();
                for (int m = 0; m < 2; m++) {
                    switch (m) {
                        case 0: // doc content
                            termsFreq = re.getTermFreqVector(k, "doccontent");
                            freq = termsFreq.getTermFrequencies();
                            terms = termsFreq.getTerms();
                            noOfTerms = terms.length;
                            score = new float[noOfTerms];
                            for (i = 0; i < noOfTerms; i++) {
                                wordMap.put(terms[i], freq[i]);
                            }
                            break;
                        case 1: // doc ID
                            termsFreqDocId = re.getTermFreqVector(k, "docid");
                            aInt = Integer.parseInt(termsFreqDocId.getTerms()[0]);
                            break;
                        /*
                         * case 2: // FIBO terms
                         *     termsFreqFiboTerm = re.getTermFreqVector(k, "fiboterms");
                         *     if (termsFreqFiboTerm != null) {
                         *         freq = termsFreqFiboTerm.getTermFrequencies();
                         *         terms = termsFreqFiboTerm.getTerms();
                         *         noOfTerms = terms.length;
                         *         score = new float[noOfTerms];
                         *         for (i = 0; i < noOfTerms; i++) {
                         *             wordMap.put(terms[i], freq[i]);
                         *         }
                         *     }
                         *     break;
                         * case 3: // taxonomy terms
                         *     termsFreqTaxoTerm = re.getTermFreqVector(k, "taxoterms");
                         *     if (termsFreqTaxoTerm != null) {
                         *         freq = termsFreqTaxoTerm.getTermFrequencies();
                         *         terms = termsFreqTaxoTerm.getTerms();
                         *         noOfTerms = terms.length;
                         *         score = new float[noOfTerms];
                         *         for (i = 0; i < noOfTerms; i++) {
                         *             wordMap.put(terms[i], freq[i]);
                         *         }
                         *     }
                         *     break;
                         */
                        default:
                            //System.out.println("Invalid Entry!");
                            break;
                    }
                }
                tfMap.put(aInt, wordMap);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return tfMap;
    }

    public HashMap<Integer, TreeMap> topKFiboTermsTest(int noOfDocSections, String[] selectedDocuments,
            double okapiCutOff) {
        int noOfDocs = docNames.length;
        for (int i = 0; i < noOfDocs; i++) {
            try {
                index(i);
                //indexWithSPPOSTagger(i, tagger);
            } catch (IOException ex) {
                ex.printStackTrace();
                Logger.getLogger(DocIndexer.class.getName()).log(Level.SEVERE, null, ex);
            }
        }
        HashMap<Integer, TreeMap> topKTerms = new HashMap<Integer, TreeMap>();
        return topKTerms;
    }
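    /*
     * Added sketch, not part of the original class: OkapiSimilarity (used in
     * topKFiboTerms below) is an external class whose source is not shown
     * here, so this is only the textbook Okapi BM25 term score for reference;
     * the project's implementation and its cutoff scale may differ. k1 and b
     * are the usual free parameters (about 1.2 and 0.75).
     */
    private static double bm25TermScoreSketch(int termFreq, int docFreq, int numDocs,
            double docLength, double avgDocLength) {
        double k1 = 1.2;
        double b = 0.75;
        // Robertson-Sparck Jones style IDF component.
        double idf = Math.log((numDocs - docFreq + 0.5) / (docFreq + 0.5));
        // Term-frequency component, normalized by document length.
        double tfNorm = (termFreq * (k1 + 1)) / (termFreq + k1 * (1 - b + b * (docLength / avgDocLength)));
        return idf * tfNorm;
    }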
    /**
     * Indexes all documents, scores every document section against the FIBO
     * documents with the Okapi similarity, and returns the top-scoring FIBO
     * terms per section.
     *
     * @param noOfDocSections   the number of document sections
     * @param selectedDocuments the IDs of the selected document sections
     * @param okapiCutOff       similarity values at or below this cutoff are discarded
     * @return a map from section ID to its top-K terms and scores
     */
    public HashMap<Integer, TreeMap> topKFiboTerms(int noOfDocSections, String[] selectedDocuments,
            double okapiCutOff) throws IOException, CorruptIndexException, ParseException,
            ClassNotFoundException, Exception {
        int noOfDocs = docNames.length;
        float tfIdfScore[][] = new float[noOfDocs][];
        //HashMap<Integer, float[]> scoreMap = new HashMap<Integer, float[]>();
        HashMap<Integer, HashMap> scoreMap = new HashMap<Integer, HashMap>();

        /*
         * Do all the indexing at once.
         */
        //MaxentTagger tagger = new MaxentTagger("tagger/left3words-wsj-0-18.tagger");
        //MaxentTagger tagger = new MaxentTagger("tagger/bidirectional-distsim-wsj-0-18.tagger");
        for (int i = 0; i < noOfDocs; i++) {
            index(i);
            //indexWithTaxoFibo(i);
            //indexWithSPPOSTagger(i, tagger);
        }

        int weight = 1;
        //scoreMap = tfIdfScoreWithMarkUpTerms(noOfDocs, weight);

        /*
         * Calculate the Okapi similarity: this score map holds the raw term
         * frequencies per document, keyed by document ID.
         */
        scoreMap = getTfForDocs(noOfDocs, weight);

        /*
         * Printing the average document length:
         *
         * int docsectionWordCount = 0;
         * int fidosectionWordCount = 0;
         * HashMap<String, Integer> wordMap;
         * for (int k = 0; k < noOfDocSections; k++) {
         *     wordMap = scoreMap.get(k);
         *     for (Map.Entry entry : wordMap.entrySet()) {
         *         docsectionWordCount = docsectionWordCount + (Integer) entry.getValue();
         *     }
         * }
         * System.out.println("Average words per docSection=" + docsectionWordCount / noOfDocSections);
         * for (int k = noOfDocSections; k < noOfDocs; k++) {
         *     wordMap = scoreMap.get(k);
         *     for (Map.Entry entry : wordMap.entrySet()) {
         *         fidosectionWordCount = fidosectionWordCount + (Integer) entry.getValue();
         *     }
         * }
         * System.out.println("Average words per fibo docs=" + fidosectionWordCount / (noOfDocs - noOfDocSections));
         */

        ArrayList<Double> simi = new ArrayList<Double>();
        OkapiSimilarity okapiSim = new OkapiSimilarity(ramMemDir);

        // Stores <section ID, top-K terms>.
        HashMap<Integer, TreeMap> topKTerms = new HashMap<Integer, TreeMap>();

        for (int p = 0; p < noOfDocSections; p++) {
            int noOfFiles;
            Double db[] = new Double[noOfDocs];
            double sim[];
            HashMap<String, Double> termsScore = new HashMap<String, Double>();
            try {
                //sim = doc.consineSimilarityTo(0);
                db = okapiSim.computeSimilarity(scoreMap, p);
                simi.addAll(Arrays.asList(db));
                int aa = 0;
                // Keep only the documents whose similarity exceeds the cutoff.
                for (int i = noOfDocSections; i < simi.size(); i++) {
                    aa++;
                    double temp = simi.get(i);
                    //if (!Double.isNaN(temp) && temp > 10.274) {
                    if (!Double.isNaN(temp) && temp > okapiCutOff) {
                        termsScore.put(docNames[i], temp);
                    }
                }
                // Sort the surviving terms by descending score.
                ValueComparator bvc = new ValueComparator(termsScore);
                SortedMap<String, Double> sorted_map = Collections
                        .synchronizedSortedMap(new TreeMap<String, Double>(bvc));
                sorted_map.putAll(termsScore);
                // If there are more than 15 entries, remove the excess.
                int count = 0;
                Iterator it = sorted_map.entrySet().iterator();
                while (it.hasNext()) {
                    it.next();
                    count++;
                    if (count > 15) {
                        it.remove();
                    }
                }
                topKTerms.put(Integer.parseInt(selectedDocuments[p]), new TreeMap<String, Double>(sorted_map));
            } catch (IOException e) {
                sim = null;
                e.printStackTrace();
            }
            simi.clear();
        }
        return topKTerms;
    }
}
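/*
 * Added usage sketch, not part of the original source. ValueComparator and
 * OkapiSimilarity are external project classes that are not shown in this
 * listing; ValueComparator is presumably a Comparator<String> that orders keys
 * by descending score in the map passed to its constructor. The example below
 * only exercises the String[]-content constructor and consineSimilarityTo(...)
 * defined above; the class name, document names, and text are illustrative.
 */
class DocIndexerUsageExample {

    public static void main(String[] args) throws Exception {
        String[] docNames = {"report-a", "report-b", "report-c"};
        String[] docContent = {
            "Quarterly revenue increased while operating costs fell.",
            "Revenue and operating costs were flat for the quarter.",
            "The annual general meeting elected a new board of directors."
        };

        // Index the three in-memory documents and score them against document 0.
        DocIndexer indexer = new DocIndexer(docContent, docNames);
        double[] scores = indexer.consineSimilarityTo(0);

        for (int i = 0; i < scores.length; i++) {
            System.out.println(docNames[i] + " -> " + scores[i]);
        }
    }
}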