// BSD License (http://www.galagosearch.org/license)
package org.galagosearch.core.index.geometric;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.TreeMap;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.galagosearch.core.index.mem.FlushToDisk;
import org.galagosearch.core.index.mem.MemoryIndex;
import org.galagosearch.core.index.mem.MemoryParameters;
import org.galagosearch.core.index.mem.MemoryRetrieval;
import org.galagosearch.core.mergeindex.sequential.MergeSequentialIndexShards;
import org.galagosearch.core.parse.NumberedDocument;
import org.galagosearch.core.retrieval.MultiRetrieval;
import org.galagosearch.core.retrieval.Retrieval;
import org.galagosearch.core.retrieval.structured.DocumentOrderedFeatureFactory;
import org.galagosearch.core.retrieval.structured.StructuredRetrieval;
import org.galagosearch.tupleflow.InputClass;
import org.galagosearch.tupleflow.Parameters;
import org.galagosearch.tupleflow.Processor;
import org.galagosearch.tupleflow.TupleFlowParameters;
import org.galagosearch.tupleflow.Utility;
import org.galagosearch.tupleflow.execution.Verified;


/*
 *  author: sjh, schiu
 *
 *  The Geometric Index Writer updates an in-memory index;
 *  periodically the memory index is flushed to disk.
 *  Depending on the number of indexes already on disk,
 *  some index blocks may then be merged.
 *
 *  Notes:
 *  Document ids are unique throughout the system;
 *  the merging process should not re-number documents.
 *
 *  indexBlockSize is the number of documents in an index block.
 *  Empirically (over TREC newswire documents), 50000 documents
 *  should use between 500 and 800MB of RAM. Depending on your
 *  collection, you may want to change this.
 */
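/*
 *  A minimal configuration sketch (the parameter names are taken from the
 *  constructor below; the exact XML layout is an assumption and may differ
 *  in your TupleFlow setup):
 *
 *    <directory>/path/to/shards</directory>
 *    <stemming>false</stemming>
 *    <mergeMode>local</mergeMode>
 *    <indexBlockSize>50000</indexBlockSize>
 *    <radix>3</radix>
 */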
@Verified
@InputClass(className = "org.galagosearch.core.parse.NumberedDocument")
public class GeometricIndex extends MultiRetrieval implements Processor<NumberedDocument> {

  private Logger logger = Logger.getLogger(GeometricIndex.class.toString());
  String shardDirectory;
  long globalDocumentCount;
  int indexBlockSize; // measured in documents
  int indexBlockCount;
  boolean stemming;
  String mergeMode;
  MemoryIndex currentMemoryIndex;
  GeometricPartitions geometricParts;
  Parameters retrievalParameters = new Parameters();
  Parameters statistics;
  private MemoryRetrieval currentMemoryRetrieval;

  public GeometricIndex(TupleFlowParameters parameters) throws Exception {


    //note: need to find some way to pass in retrieval parameters
    //stopgap just for testing
    super(new HashMap<String, Collection<Retrieval>>(), new Parameters());


    shardDirectory = parameters.getXML().get("directory");
    stemming = (boolean) parameters.getXML().get("stemming", false);
    mergeMode = parameters.getXML().get("mergeMode", "local");

    // 10,000 is a small testing value -- 50,000 is probably a better choice.
    indexBlockSize = (int) parameters.getXML().get("indexBlockSize", 10000);
    // radix is the number of indexes of each size to store before a merge op
    // keep in mind that the total number of indexes is difficult to control
    int radix = (int) parameters.getXML().get("radix", 3);
    geometricParts = new GeometricPartitions(radix);

    // initialisation
    globalDocumentCount = 0;
    indexBlockCount = 0;
    retrievalParameters.add("retrievalGroup", "all");

    resetCurrentMemoryIndex();
    updateRetrieval();
  }

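  // Adds a single document to the in-memory index; once indexBlockSize
  // documents have accumulated, the current block is flushed to disk and a
  // local merge pass is triggered.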
  public void process(NumberedDocument doc) throws IOException {
    currentMemoryIndex.process(doc);
    globalDocumentCount++; // now one higher than the document just processed
    if (globalDocumentCount % indexBlockSize == 0) {
      flushCurrentIndexBlock();
      maintainMergeLocal();
    }
  }

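  // Flushes any remaining in-memory documents and merges every on-disk shard
  // into a single "final" index under the shard directory.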
  public void close() throws IOException {
    // this will ensure that all data is on disk
    flushCurrentIndexBlock();

    logger.info("Performing final merge");

    try {
      doMerge(geometricParts.getAllShards(), new File(shardDirectory + File.separator + "final"));
    } catch (IOException ex) {
      Logger.getLogger(GeometricIndex.class.getName()).log(Level.SEVERE, null, ex);
    }
  }

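  // Writes the current in-memory index to disk as a new size-1 shard,
  // registers it with the geometric partitions, and starts a fresh memory index.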
  private void flushCurrentIndexBlock() {
    // First check that the current memory index contains some data:
    if (currentMemoryIndex.getCollectionLength() < 1) {
      return;
    }

    logger.info("Flushing current memory Index. id = " + indexBlockCount);

    // we may want to wait for a previous flush to complete,
    // but this is not strictly necessary

    final GeometricIndex g = this;
    final MemoryIndex flushingMemoryIndex = currentMemoryIndex;
    final File shardFolder = getNextIndexShardFolder(1);

    // reset the current index for the next document
    resetCurrentMemoryIndex();

    try {
      // first flush the index to disk
      (new FlushToDisk()).flushMemoryIndex(flushingMemoryIndex, shardFolder.getAbsolutePath(), false);
      // indicate that the flushing part of this thread is done

      synchronized (geometricParts) {
        updateRetrieval();
        flushingMemoryIndex.close();

        // add the flushed index to the set of bins -- needs to be a synchronous action
        geometricParts.add(1, shardFolder.getAbsolutePath());
      }

    } catch (IOException e) {
      logger.severe(e.toString());
    }
  }

  private void maintainMergeLocal() {
    logger.info("Maintaining Merge Local");
    try {
      // start at the smallest bin size; findMergeCandidates cascades to larger sizes as needed
      mergeIndexShards(1);
    } catch (IOException ex) {
      Logger.getLogger(GeometricIndex.class.getName()).log(Level.SEVERE, null, ex);
    }
  }

  // only one thread can perform a merge at any time.
  // this should stop duplicate merges
  private void mergeIndexShards(int binSize) throws IOException {

    Bin mergeBin = null;
    synchronized (geometricParts) {
      mergeBin = geometricParts.findMergeCandidates(binSize, true);
    }

    if (mergeBin == null) {
      //logger.info("Nothing to merge, returning.");
      return;
    } else {
      doMerge(mergeBin);
    }

  }

  //wrapper that auto-creates the location
  private void doMerge(Bin mergeBin) throws IOException {
    // make a new index shard
    doMerge(mergeBin, getNextIndexShardFolder(mergeBin.size + 1));
  }

  //merges and places the merged index in that location
  private void doMerge(Bin mergeBin, File indexShard) throws IOException {
    logger.info("Performing merge!");

    // merge the shards
    MergeSequentialIndexShards merger = new MergeSequentialIndexShards(mergeBin.getBinPaths(), indexShard.getAbsolutePath());

    try {
      merger.run(mergeMode);
    } catch (Exception ex) {
      // I just want to see these errors for now...
      Logger.getLogger(GeometricIndex.class.getName()).log(Level.SEVERE, null, ex);
    }

    // make sure these two operations happen together, uninterrupted
    synchronized (geometricParts) {
      geometricParts.add(mergeBin.size + 1, indexShard.getAbsolutePath());
      geometricParts.removeShards(mergeBin);
      updateRetrieval();
    }

    // the merged input shards can now be deleted
    for (String file : mergeBin.getBinPaths()) {
      Utility.deleteDirectory(new File(file));
    }

    logger.info("Done merging.");
  }

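  // Creates a uniquely numbered shard directory for a bin of the given size
  // under the shard directory.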
  private File getNextIndexShardFolder(int size) {
    File indexFolder = new File(shardDirectory + File.separator + "galagoindex.shard." + indexBlockCount + "." + size);
    indexFolder.mkdirs();
    indexBlockCount++;

    return indexFolder;
  }

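  // Starts a fresh in-memory index whose first document id continues from the
  // global document count, keeping document ids unique across all shards.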
  private void resetCurrentMemoryIndex() {
    Parameters parameters = new Parameters();
    parameters.set("firstDocumentId", Long.toString(globalDocumentCount));
    currentMemoryIndex = new MemoryIndex(parameters);
  }

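  // Rebuilds the retrieval set: one MemoryRetrieval over the current in-memory
  // index plus one retrieval per on-disk shard, with their statistics merged.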
  private void updateRetrieval() throws IOException {
    Collection<Retrieval> r = geometricParts.getAllShards().getAllRetrievals();
    currentMemoryRetrieval = new MemoryRetrieval(currentMemoryIndex, retrievalParameters);
    ArrayList<Retrieval> allRetrievals = new ArrayList<Retrieval>(r.size() + 1);
    allRetrievals.addAll(r);

    ArrayList<Parameters> staticParameters = new ArrayList<Parameters>();
    for(Retrieval ret: r ){
        staticParameters.add(ret.getRetrievalStatistics("all"));
    }

    statistics = new MemoryParameters((MemoryRetrieval)currentMemoryRetrieval, mergeStats(staticParameters));
    statistics.add("retrievalGroup", "all");
    allRetrievals.add(currentMemoryRetrieval);
    retrievals.put("all", allRetrievals);
    initRetrieval();
  }

  // MultiRetrieval modification:
  // simplified, since the statistics need to be recompiled every time.
  private void initRetrieval() throws IOException {
    featureFactories = new HashMap();
    featureFactories.put("all", new DocumentOrderedFeatureFactory(getRetrievalStatistics("all")));

  }

  // since all the retrievals expose the same index parts, just return any one of them
  public Parameters getAvailableParts(String retGroup) throws IOException {
    return currentMemoryRetrieval.getAvailableParts("all");
  }

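  // returns the merged statistics computed in updateRetrieval();
  // only the "all" retrieval group is maintained, so retGroup is ignored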
  public Parameters getRetrievalStatistics(String retGroup) throws IOException {
    return statistics;
  }
  
  // Sub-classes
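  // A Bin groups the on-disk shards of a single size level; keys are the shard
  // paths and values are open retrievals over those shards.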
  private class Bin {

    int size;
    private TreeMap<String, Retrieval> binPaths = new TreeMap<String, Retrieval>();

    public Bin(int size) {
      this.size = size;
    }

    public void add(Bin b) {
      binPaths.putAll(b.binPaths);
    }

    public void add(String path) {
      try {
        binPaths.put(path, new StructuredRetrieval(path, retrievalParameters));
      } catch (Exception e) {
        logger.info("Index " + path + " cannot be opened.");
      }

    }

    private void removeAll(Bin b) {
      for (String key : b.getBinPaths()) {
        binPaths.remove(key);
      }
    }

    private int size() {
      return binPaths.size();
    }

    private Collection<String> getBinPaths() {
      return binPaths.keySet();
    }

    public Collection<Retrieval> getAllRetrievals() {
      return binPaths.values();
    }
  }

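  // Maintains the geometric series of bins, keyed by size level; once a bin
  // accumulates radix shards it becomes a candidate for merging into the next level.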
  private class GeometricPartitions {

    int radix;
    TreeMap<Integer, Bin> radixBins = new TreeMap<Integer, Bin>();

    public GeometricPartitions(int radix) {
      this.radix = radix;
    }

    private Bin get(int size) {
      return radixBins.get(size);
    }

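    // registers a newly written shard path under its size level, creating the bin on demand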
    private void add(int size, String path) {
      if (!radixBins.containsKey(size)) {
        radixBins.put(size, new Bin(size));
      }
      radixBins.get(size).add(path);
    }

    // Returns a new Bin containing the selected shard paths, so computation
    // can continue on the live bins while the merge proceeds.
    //
    // If cascade is true, larger bin sizes are also included when merging
    // the current bin size would push the next size up to radix.
    private Bin findMergeCandidates(int size, boolean cascade) {
      Bin candidate;
      Bin result = new Bin(0);
      for (int i = size; i <= getMaxSize(); i++) {
        candidate = radixBins.get(i);
        if (candidate == null) {
          break; // no bin of this size exists yet, so nothing can cascade further
        }
        // a cascading merge adds one extra shard (the merged output) to this bin
        if (candidate.size() + ((size == i) ? 0 : 1) >= radix) {
          logger.info("Adding Merge Candidate of size: " + i);
          result.size = i;
          result.add(candidate);
        } else {
          break;
        }
        if (!cascade) {
          break;
        }
      }

      if (result.size() > 0) {
        return result;
      }
      return null;
    }

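    // collects every shard from every bin into a single Bin,
    // used for the final merge and for rebuilding the retrieval set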
    private Bin getAllShards() {
      Bin result = new Bin(0);
      for (Integer i : radixBins.keySet()) {
        if (i.intValue() > result.size) {
          result.size = i.intValue();
        }
        result.add(radixBins.get(i));
      }
      return result;
    }

    // only remove merged shards
    private void removeShards(Bin merged) {
      //search all bins and remove.
      for (Integer i : radixBins.keySet()) {
        radixBins.get(i).removeAll(merged);
      }
    }

    private int getMaxSize() {
      int max = 0;
      for (int i : radixBins.keySet()) {
        if (i > max) {
          max = i;
        }
      }
      return max;
    }
  }
}