FindIO.TextIndex.java Source code

Introduction

Here is the source code for FindIO.TextIndex.java
Source

package FindIO;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.*;

import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.util.Version;

import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;

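/**
 * Builds, searches and updates a Lucene inverted index that maps each tag
 * to the images annotated with it, together with a per-image score.
 */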
public class TextIndex extends Index {

    // Location of the on-disk index; assigned by setIndexfile().
    private File indexFile;
    // Memory-mapped directory over indexFile; opened in initBuilding().
    private MMapDirectory MMapDir;
    // Writer configuration assembled in initBuilding().
    private IndexWriterConfig config;
    // NOTE(review): indexReader and areader are not referenced by any method of this class — confirm before removing.
    private IndexReader indexReader;
    private AtomicReader areader;
    // Name of the document field holding the tag (stored after textAnalyzer.getStem()).
    private String fieldname1 = "tag";
    // Name of the document field holding the posting string "imageId value imageId value ...".
    private String fieldname2 = "img_freq";

    // Field instances created once in initBuilding() and re-populated for every document written.
    private Field tag_field;
    private Field img_field;

    // Created in setIndexfile(); its getStem() is applied to tags before they are stored.
    private TextAnalyzer textAnalyzer;

    // RAM buffer size, in MB, passed to IndexWriterConfig.setRAMBufferSizeMB() in initBuilding().
    private int MAX_BUFF = 48;

    // Scratch buffer in which addDoc() assembles the posting string of one tag.
    private StringBuffer strbuf;
    // Allocated in initBuilding(); NOTE(review): not otherwise used in this class.
    private StringBuffer databuf;

    // NOTE(review): not referenced by any method of this class.
    private FileInputStream binIn;

    public TextIndex() {
        setIndexfile("./src/FindIO/index/textIndex");
    }

    /**
     * Points this index at the given location and creates a fresh text analyzer.
     *
     * <p>If the analyzer cannot be created, an error message is printed and the
     * analyzer field keeps its previous value.
     *
     * @param indexfilename path of the index directory
     */
    public void setIndexfile(String indexfilename) {
        indexFile = new File(indexfilename);

        try {
            textAnalyzer = new TextAnalyzer();
        } catch (IOException analyzerFailure) {
            System.out.println(Common.MESSAGE_TEXT_ANALYZER_ERROR);
        }

        // Only report the new location when running in test mode.
        if (!test) {
            return;
        }
        System.out.println("The Index File is set: " + indexfilename);
    }

    /**
     * Command-line entry point: builds the tag index from the training tag file.
     * Any failure during initialisation or indexing is reported as a stack trace.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        final String tagFile = "./src/FindIO/Datasets/train/image_tags.txt";
        final TextIndex indexer = new TextIndex();
        try {
            indexer.initBuilding();
            indexer.buildIndex(tagFile);
        } catch (Throwable failure) {
            failure.printStackTrace();
        }
    }

    /**
     * Prepares this instance for writing: opens a memory-mapped directory over
     * the index location, configures and opens the index writer, and allocates
     * the reusable fields and string buffers.
     *
     * <p>The writer is opened with {@code OpenMode.CREATE} and a RAM buffer of
     * {@code MAX_BUFF} MB. The elapsed set-up time is recorded in
     * {@code initbuilding_time}.
     *
     * @throws Throwable if the directory or the writer cannot be opened
     */
    public void initBuilding() throws Throwable {
        startbuilding_time = System.currentTimeMillis();

        // Standard analyzer handed to the writer configuration.
        final Analyzer standardAnalyzer = new StandardAnalyzer();
        // The index is accessed through a memory-mapped directory.
        this.MMapDir = new MMapDirectory(this.indexFile);

        this.config = new IndexWriterConfig(Version.LUCENE_4_10_1, standardAnalyzer);
        this.config.setRAMBufferSizeMB(this.MAX_BUFF);
        this.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

        // Dump the effective writer configuration in test mode.
        if (test) {
            System.out.println("Max Docs Num:\t" + this.config.getMaxBufferedDocs());
            System.out.println("RAM Buffer Size:\t" + this.config.getRAMBufferSizeMB());
            System.out.println("Max Merge Policy:\t" + this.config.getMergePolicy());
        }

        MMwriter = new IndexWriter(this.MMapDir, this.config);

        // Placeholder values; the real values are set for each document written.
        final String placeholder = "-1";
        this.tag_field = new TextField(this.fieldname1, placeholder, Field.Store.YES);
        this.img_field = new TextField(this.fieldname2, placeholder, Field.Store.YES);

        this.strbuf = new StringBuffer();
        this.databuf = new StringBuffer();

        initbuilding_time = System.currentTimeMillis() - startbuilding_time;
    }

    /**
     * Builds the inverted tag index from a tag file and closes the writer.
     *
     * <p>Every non-blank line is read as {@code <image file> <tag> <tag> ...},
     * with tokens separated by single spaces. The image id is the first token
     * with its extension removed. Each tag occurrence contributes one
     * {@code (imageId, 1)} pair to that tag's posting list, and one document is
     * written per distinct tag.
     *
     * <p>{@link #initBuilding()} must have been called first, because this
     * method writes through the writer and fields it sets up.
     *
     * @param dataFile path of the tag file to index
     * @throws Throwable if the file cannot be read or the writer cannot be closed
     */
    public void buildIndex(String dataFile) throws Throwable {
        Map<String, ArrayList<FindIOPair>> tagImgMap = new HashMap<String, ArrayList<FindIOPair>>();

        // try-with-resources: the reader is released even when reading or parsing fails.
        try (BufferedReader reader = new BufferedReader(new FileReader(dataFile))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String trimmed = line.trim();
                if (trimmed.isEmpty()) {
                    // A blank line carries neither an image id nor tags.
                    continue;
                }
                String[] img_tags = trimmed.split(" ");
                String imgID = Common.removeExtension(img_tags[0]);
                for (int i = 1; i < img_tags.length; i++) {
                    String tag = img_tags[i];
                    if (tag.isEmpty()) {
                        // Consecutive spaces yield empty tokens; they are not tags.
                        continue;
                    }
                    ArrayList<FindIOPair> imgPairList = tagImgMap.get(tag);
                    if (imgPairList == null) {
                        imgPairList = new ArrayList<FindIOPair>();
                        tagImgMap.put(tag, imgPairList);
                    }
                    imgPairList.add(new FindIOPair(imgID, 1));
                }
            }
        }

        // One document per tag; entrySet() avoids a second lookup per key.
        for (Map.Entry<String, ArrayList<FindIOPair>> posting : tagImgMap.entrySet()) {
            addDoc(posting.getKey(), posting.getValue());
            index_count++;
        }
        System.out.println("Number of index: " + index_count);
        closeWriter();
    }

    /**
     * Writes one document for a tag. The document has two stored fields: the
     * tag (after {@code textAnalyzer.getStem}) and the posting string, which
     * is the space-separated sequence {@code id value id value ...} of the
     * given pairs.
     *
     * <p>An {@link IOException} from the writer is reported on standard error
     * and not propagated.
     *
     * @param tag tag used as the key of the inverted index
     * @param imgPairList posting list of (image id, value) pairs for the tag
     */
    public void addDoc(String tag, ArrayList<FindIOPair> imgPairList) {
        // Reuse the shared buffer: reset it, then serialise every pair.
        strbuf.setLength(0);
        final long appendStarted = System.currentTimeMillis();
        for (FindIOPair posting : imgPairList) {
            strbuf.append(posting.getID())
                  .append(" ")
                  .append(posting.getValue())
                  .append(" ");
        }
        strbuf_time += (System.currentTimeMillis() - appendStarted);

        // Populate the reusable fields with this tag's values.
        final String postings = strbuf.toString().trim();
        img_field.setStringValue(postings);
        tag_field.setStringValue(textAnalyzer.getStem(tag));

        final Document tagDocument = new Document();
        tagDocument.add(tag_field);
        tagDocument.add(img_field);

        try {
            MMwriter.addDocument(tagDocument);
            System.out.println(Common.MESSAGE_FILE_INDEX_SUCCESS + tag);
        } catch (IOException writeFailure) {
            System.err.println(Common.MESSAGE_TEXT_INDEX_ERROR);
            if (test) {
                writeFailure.printStackTrace();
            }
        }
    }

    /**
     * Searches the tag field for the given query and collects, per image, the
     * value stored for each query term.
     *
     * <p>The query string is split on single spaces into terms. A hit is used
     * only when its stored tag equals one of those terms literally; other hits
     * are skipped. For every image listed in a used hit, the result holds an
     * array with one slot per term, and the slot of the matching term is set
     * to the value stored next to that image.
     *
     * @param queryString space-separated query terms, also parsed as a Lucene query
     * @return map from image name to a per-term value array (0 where a term has no entry)
     * @throws Exception if the index cannot be opened, the query cannot be parsed,
     *         or a stored value is not a number
     */
    public Map<String, double[]> searchText(String queryString) throws Exception {
        List<String> terms = Arrays.asList(queryString.trim().split(" "));
        Map<String, double[]> mapResults = new HashMap<String, double[]>();

        // try-with-resources: directory, reader and analyzer are released on every path,
        // including a query parse failure.
        try (FSDirectory directory = FSDirectory.open(indexFile);
             IndexReader reader = DirectoryReader.open(directory);
             Analyzer analyzer = new StandardAnalyzer()) {

            IndexSearcher searcher = new IndexSearcher(reader);
            QueryParser parser = new QueryParser(fieldname1, analyzer);

            Query query = parser.parse(queryString);
            if (test) {
                System.out.println("Searching for text: " + query.toString(fieldname1));
            }

            long start = System.currentTimeMillis();
            TopDocs topDocs = searcher.search(query, null, Common.topK);
            if (test) { // report the search time as a benchmark
                System.out.println("Time: " + (System.currentTimeMillis() - start) + " ms");
            }

            for (ScoreDoc hit : topDocs.scoreDocs) {
                Document doc = searcher.doc(hit.doc);
                int index = terms.indexOf(doc.get(fieldname1));
                if (index == -1) {
                    continue;
                }
                String postings = doc.get(fieldname2);
                if (postings == null) {
                    continue;
                }
                String[] images = postings.trim().split("\\s+");
                // Tokens come in (image, value) pairs; "i + 1 < length" ignores an empty
                // posting string or a dangling image without a value instead of failing.
                for (int i = 0; i + 1 < images.length; i += 2) {
                    String imageName = images[i];
                    double[] docTerms = mapResults.get(imageName);
                    if (docTerms == null) {
                        docTerms = new double[terms.size()];
                        mapResults.put(imageName, docTerms);
                    }
                    docTerms[index] = Double.parseDouble(images[i + 1]);
                }
            }
        }

        return mapResults;
    }

    /**
     * Adjusts the scores of one image for a list of tags; mainly used for
     * relevance feedback. The tags passed in should already be stemmed.
     *
     * <p>For each (tag, delta) pair the tag is looked up in the index:
     * <ul>
     *   <li>no hit and {@code delta > 0}: a new document is added for the tag
     *       with the single posting {@code imageID 0.1*delta};</li>
     *   <li>no hit and {@code delta <= 0}: nothing is written;</li>
     *   <li>otherwise the posting string of the first hit is rewritten: the
     *       image's score is increased by delta (floored at 0), or the image is
     *       appended with score delta when it was not listed.</li>
     * </ul>
     * The writer is committed after every pair and closed once all pairs have
     * been processed. The writer and the reusable fields must have been set up
     * by {@link #initBuilding()} beforehand.
     *
     * @param imageID id of the image whose scores change
     * @param tag_score_pairs (tag, score delta) pairs to apply
     * @throws Throwable if the index cannot be read or written, a tag cannot be
     *         parsed as a query, or a stored score is not a number
     */
    public void updateScore(String imageID, List<FindIOPair> tag_score_pairs) throws Throwable {
        // try-with-resources: directory, reader and analyzer are released even when an
        // update fails half-way.
        try (FSDirectory directory = FSDirectory.open(indexFile);
             IndexReader reader = DirectoryReader.open(directory);
             Analyzer analyzer = new StandardAnalyzer()) {

            IndexSearcher searcher = new IndexSearcher(reader);
            QueryParser parser = new QueryParser(fieldname1, analyzer);

            for (FindIOPair pair : tag_score_pairs) {
                String tag = pair.getID();
                double add_score = pair.getValue();

                Query query = parser.parse(tag);
                System.out.println("Updating Text: " + query.toString(fieldname1));

                long start = System.currentTimeMillis();
                TopDocs topDocs = searcher.search(query, null, Common.topK);
                if (test) { // report the search time as a benchmark
                    System.out.println("Time: " + (System.currentTimeMillis() - start) + " ms");
                }

                ScoreDoc[] hits = topDocs.scoreDocs;
                if (hits.length == 0) {
                    // Unknown tag: only positive feedback creates a new document.
                    if (add_score > 0) {
                        Document doc = new Document();
                        this.tag_field.setStringValue(this.textAnalyzer.getStem(tag));
                        // Stored without a trailing space, like the other posting strings.
                        this.img_field.setStringValue(imageID + " " + (0.1 * add_score));
                        doc.add(tag_field);
                        doc.add(img_field);
                        MMwriter.addDocument(doc);
                    }
                } else {
                    // Known tag: rewrite the posting string of the best hit.
                    Document doc = searcher.doc(hits[0].doc);
                    String merged = mergeImageScore(doc.get(fieldname2), imageID, add_score);

                    // Drop every occurrence of the old posting field before re-adding it.
                    doc.removeFields(fieldname2);
                    this.img_field.setStringValue(merged.trim());
                    if (test) {
                        System.out.println(merged);
                    }
                    doc.add(img_field);
                    MMwriter.updateDocument(new Term(this.fieldname1, tag), doc);
                }

                MMwriter.commit();
            }
        }
        closeWriter();
    }

    /**
     * Returns the posting string with the score of imageID increased by add_score
     * (floored at 0), or with imageID appended when it is not listed yet.
     * Every pair in the result, including the last, is followed by one space.
     */
    private static String mergeImageScore(String currentScores, String imageID, double add_score) {
        String[] tokens = (currentScores == null)
                ? new String[0]
                : currentScores.trim().split("\\s+");
        StringBuilder merged = new StringBuilder();
        boolean isImageContained = false;

        // Tokens come in (image, score) pairs; "i + 1 < length" skips an empty posting
        // string or a dangling image without a score instead of failing.
        for (int i = 0; i + 1 < tokens.length; i += 2) {
            String img = tokens[i];
            double score = Double.parseDouble(tokens[i + 1]);
            if (img.equals(imageID)) {
                score += add_score;
                if (score < 0) {
                    score = 0;
                }
                isImageContained = true;
            }
            merged.append(img).append(' ').append(score).append(' ');
        }

        if (!isImageContained) {
            merged.append(imageID).append(' ').append(add_score).append(' ');
        }
        return merged.toString();
    }
}