// BitStreamHPIndexWriter.java — MG4J, package it.unimi.dsi.mg4j.index
// (scraped web-navigation header removed so the file compiles)
package it.unimi.dsi.mg4j.index;

/*     
 * MG4J: Managing Gigabytes for Java
 *
 * Copyright (C) 2007 Paolo Boldi and Sebastiano Vigna 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 2.1 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 */

import it.unimi.dsi.bits.Fast;
import it.unimi.dsi.fastutil.ints.Int2IntRBTreeMap;
import it.unimi.dsi.fastutil.io.FastBufferedInputStream;
import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;
import it.unimi.dsi.mg4j.index.CompressionFlags.Component;
import it.unimi.dsi.mg4j.index.payload.Payload;
import it.unimi.dsi.fastutil.io.FastByteArrayOutputStream;
import it.unimi.dsi.io.NullOutputStream;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.mg4j.search.score.VignaScorer;
import it.unimi.dsi.Util;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.util.Properties;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Map;

/** Writes a bitstream-based high-performance index. The comments about
 * offsets in the documentation of {@link BitStreamIndexWriter} apply here, too.
 *
 * <p>The difference between indices generated by this class and those generated
 * by {@link BitStreamIndexWriter} lie in the level
 * of interleaving. Indices generated by this class have positions in a separate stream (similarly to Lucene), and
 * a compulsory skip structure (an extension of that used by a {@link BitStreamIndexWriter})
 * that indexes both the main index file and the positions file. This can result in major performance
 * improvement in the resolution of position-based operators (e.g., phrases) and in the evaluation
 * of {@linkplain VignaScorer proximity-based scorers}. Since the overhead due to the additional
 * skip structure and to the separate positions stream is negligible, indices generated by
 * this class are the default in MG4J.
 * 
 * <p>Presently, indices generated by this class cannot carry payloads: you must use a {@link BitStreamIndexWriter}
 * in that case. Moreover, only nonparametric indices can be used for positions 
 * (this limitation rules out {@link Coding#GOLOMB}, {@link Coding#SKEWED_GOLOMB}, and {@link Coding#INTERPOLATIVE}).
 * 
 * @author Sebastiano Vigna 
 * @since 1.2
 */


public class BitStreamHPIndexWriter extends AbstractBitStreamIndexWriter implements IndexWriter {
  private static final boolean ASSERTS = false;
  private static final boolean DEBUG = false;
  private static final boolean COOKIES = false;
  
  /** The size of the buffer for the temporary file used to build an inverted list. Inverted lists
   * shorter than this number of bytes will be directly rebuilt from the buffer, and never flushed to disk. */ 
  public final static int DEFAULT_TEMP_BUFFER_SIZE = 64 * 1024 * 1024;

  /** This value of {@link #state} means that we should call {@link #newInvertedList()}.*/
  protected static final int BEFORE_INVERTED_LIST = 0;

  /** This value of {@link #state} means that we are positioned at the start of an inverted list,
   * and we should call {@link #writeFrequency(int)}.*/
  protected static final int BEFORE_FREQUENCY = 1;

  /** This value of {@link #state} means that we are ready to call {@link #newDocumentRecord()}. */
  protected static final int BEFORE_DOCUMENT_RECORD = 2;

  /** This value of {@link #state} means that we just started a new document record, and we
   * should call {@link #writeDocumentPointer(OutputBitStream, int)}. */
  protected static final int BEFORE_POINTER = 3;

  /** This value of {@link #state} can be assumed only in indices that contain payloads; it
   * means that we are positioned just before the payload for the current document record. */
  protected static final int BEFORE_PAYLOAD = 4;

  /** This value of {@link #state} can be assumed only in indices that contain counts; it
   * means that we are positioned just before the count for the current document record. */
  protected static final int BEFORE_COUNT = 5;

  /** This value of {@link #state} can be assumed only in indices that contain document positions; 
   * it means that we are positioned just before the position list of the current document record. */
  protected static final int BEFORE_POSITIONS = 6;

  /** This is the first unused state. Subclasses may start from this value to define new states. */
  protected static final int FIRST_UNUSED_STATE = 7;

  /** The underlying index {@link OutputBitStream}. */
  protected OutputBitStream obs;
  /** The underlying positions {@link OutputBitStream}. */
  protected OutputBitStream positions;
  /** The offset {@link OutputBitStream}. */
  private OutputBitStream offset;
  /** The current state of the writer. */
  protected int state;
  /** The number of document records that the current inverted list will contain. */
  protected int frequency;
  /** The number of document records already written for the current inverted list. */
  protected int writtenDocuments;
  /** The current document pointer. */
  protected int currentDocument;
  /** The last document pointer in the current list. */
  protected int lastDocument;
  /** The position (in bytes) where the last inverted list started. */
  private long lastInvertedListPos;
  /** The parameter <code>b</code> for Golomb coding of pointers. */
  protected int b;
  /** The parameter <code>log2b</code> for Golomb coding of pointers; it is the most significant bit of {@link #b}. */
  protected int log2b;
  /** The maximum number of positions in a document record so far. */
  public int maxCount;
  /** The number of bits written for offsets in the file of positions. */
  public long bitsForPositionsOffsets;
  /** Maximum number of trials when optimising the entry bit length. */
  private final static int MAX_TRY = 32;

  /** The parameter <code>h</code> (the maximum height of a skip tower). */
  private final int h;

  /** The parameter <code>q</code> (2<var><sup>h</sup>q</var> document records are kept in the cache); necessarily a power of two. */
  private final int q;

  /** We have <var>w</var>=2<sup><var>h</var></sup><var>q</var>. */
  private final int w;

  /** The number of document records written in the cache containing the current block. */
  private int cache;

  /** The <var>k</var>-th entry of this array contains the document pointer of the <var>k</var>-th
   *  skip document record within the current block. For sake of simplicity, <code>pointer[cache]</code>
   *  contains the first document pointer within the next block. */
  private final int[] skipPointer;

  /** The {@link OutputBitStream}s where cached document pointers are written. */
  private final OutputBitStream[] cachePointer;

  /** The {@link FastByteArrayOutputStream}s underlying <code>cachePointer</code> . */
  private final FastByteArrayOutputStream[] cachePointerByte;

  /** The {@link OutputBitStream}s where cached skip towers are written. Indices are skip
   *  indices. */
  private final OutputBitStream[] cacheSkip;

  /** An array whose entries (as many as those of {@link #cacheSkip}) are all {@link #bitCount}. */
  private final OutputBitStream[] cacheSkipBitCount;

  /** The {@link FastByteArrayOutputStream}s underlying <code>cacheSkip</code> . Indices are skip
  *  indices. */
  private final FastByteArrayOutputStream[] cacheSkipByte;

  /** The {@link OutputBitStream} where cached document data are written. */
  private final CachingOutputBitStream cacheDataOut;

  /** The {@link FastBufferedInputStream} from which cached document data are read. */
  private final FastBufferedInputStream cacheDataIn;

  /** The length of the data segment for each quantum. */
  private final int[] cacheDataLength;

  /** The length of the positions bitstream for each quantum. */
  private final long[] cachePositionsLength;

  /** An {@link OutputBitStream} wrapping a {@link NullOutputStream} for code-length preview. */
  private final OutputBitStream bitCount;

  /** The sum of all tower data computed so far. */
  public final TowerData towerData;

  /** The number of bits written to the positions stream at the start of the current quantum. */
  private long writtenPositionsBitsAtLastQuantum;
  
  /** The number of bits written for quantum lengths. */
  public long bitsForQuantumBitLengths;

  /** The number of bits written for quantum lengths in the positions stream. */
  public long bitsForPositionsQuantumBitLengths;

  /** The number of bits written for entry lengths. */
  public long bitsForEntryBitLengths;

  /** The number of written blocks. */
  public long numberOfBlocks;

  /** An estimate on the number of bits occupied per tower entry in the last written cache, or -1 if no cache has been
   * written for the current inverted list. */
  public int prevEntryBitLength;

  /** An estimate on the number of bits occupied per quantum in the last written cache, or -1 if no cache has been
   * written for the current inverted list. */
  public int prevQuantumBitLength;

  /** An estimate on the number of bits occupied per quantum in the positions stream in the last written cache, or -1 if no cache has been
   * written for the current inverted list. */
  public int prevPositionsQuantumBitLength;

  /** The Golomb modulus for a top pointer skip, for each level. */
  private final int[] towerTopB;
  
  /** The most significant bit of the Golomb modulus for a top pointer skip, for each level. */
  private final int[] towerTopLog2B;
  
  /** The Golomb modulus for a lower pointer skip, for each level. */
  private final int[] towerLowerB;
  
  /** The most significant bit of the Golomb modulus for a lower pointer skip, for each level. */
  private final int[] towerLowerLog2B;
  
  /** The prediction for a pointer skip, for each level. */
  private final int[] pointerPrediction;


  /** The <var>k</var>-th entry of this array contains the number of bits from the start of
   * the <var>k</var>-th skip tower up to the end of the current block (more precisely,
   * to the point that should be reached via skipping, which is just after the document pointer).
   * Indices are skip indices. It is used just by {@link #tryTower(int, int, long, OutputBitStream[], TowerData, boolean)}, 
   * but it is declared here for efficiency.
   */
  final private long[] distance;

  /** The temporary file dumping the index data contained in a block. */
  final private File tempFile;

  
  /** Creates a new index writer, with the specified basename. The index will be written on a file (stemmed with <samp>.index</samp>)
   *  and positions on a separate file (stemmed with <samp>.positions</samp>).
   *  If <code>writeOffsets</code>, also an offset file will be produced (stemmed with <samp>.offsets</samp>). 
   * 
   * @param basename the basename.
   * @param numberOfDocuments the number of documents in the collection to be indexed.
   * @param writeOffsets if <code>true</code>, the offset file will also be produced.
   * @param tempBufferSize the size in bytes of the buffer for the temporary file used to build an inverted list
   * (see {@link #DEFAULT_TEMP_BUFFER_SIZE}).
   * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}).
   * @param q the skip quantum; must be strictly positive and a power of two.
   * @param h the maximum height of a skip tower; must be nonnegative.
   * @throws IOException if an I/O error occurs while creating the underlying streams.
   */
  public BitStreamHPIndexWriter( final CharSequence basename, final int numberOfDocuments, final boolean writeOffsets,  int tempBufferSize, final Map<Component,Coding> flags, final int q, final int h ) throws IOException {
    this( 
      new OutputBitStream( new FileOutputStream( basename + DiskBasedIndex.INDEX_EXTENSION ) ),
      new OutputBitStream( new FileOutputStream( basename + DiskBasedIndex.POSITIONS_EXTENSION ) ),
      writeOffsets? new OutputBitStream( new FileOutputStream( basename + DiskBasedIndex.OFFSETS_EXTENSION) ) : null,
      numberOfDocuments,
      tempBufferSize,
      flags, q , h
     );
  }

  /** Creates a new index writer using the specified underlying {@link OutputBitStream}s.
   *
   * <p>Note that, despite what the original comment said, high-performance indices cannot
   * carry payloads: {@link #writePayload(OutputBitStream, Payload)} always throws.
   *
   * @param obs the underlying output bit stream for the index.
   * @param positions the underlying output bit stream for positions.
   * @param offset the offset bit stream, or <code>null</code> if offsets should not be written.
   * @param numberOfDocuments the number of documents in the collection to be indexed.
   * @param tempBufferSize the size in bytes of the buffer for the temporary file used to build an inverted list.
   * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}).
   * @param q the skip quantum; must be strictly positive and a power of two.
   * @param h the maximum height of a skip tower; must be nonnegative.
   * @throws IOException 
   */
  public BitStreamHPIndexWriter( final OutputBitStream obs, final OutputBitStream positions, final OutputBitStream offset, final int numberOfDocuments, int tempBufferSize, final Map<Component,Coding> flags, final int q, final int h ) throws IOException {
    super( numberOfDocuments, flags );
    this.obs = obs;
    this.positions = positions;
    this.offset = offset;
    this.frequency = -1;
    this.currentTerm = -1;
    this.maxCount = 0;

    if ( ! hasCounts && hasPositions ) throw new IllegalArgumentException( "Index would have positions but no counts (this can't happen)" );
    if ( h < 0 ) throw new IllegalArgumentException( "Illegal height " + h );
    // ( q & -q ) == q iff q is a power of two.
    if ( q <= 0 || ( q & -q ) != q ) throw new IllegalArgumentException( "Illegal quantum " + q );
    this.h = h;
    this.q = q;

    int two2h = 1 << h;
    w = two2h * q;

    if ( DEBUG ) {
      System.err.println( "Cache will contain at most " + w + " records (q=" + q + ",h=" + h + ")" );
      System.err.print( "Skip records will be " );
      for ( int i = 0; i < two2h; i++ ) System.err.print( ( i * q ) + " " );
      System.err.println();
    }

    towerData = new TowerData();
    tempFile = File.createTempFile( "MG4J", ".data" );
    cacheDataIn = new FastBufferedInputStream( new FileInputStream( tempFile ) );
    cacheDataOut = new CachingOutputBitStream( tempFile, tempBufferSize );
    cacheDataLength = new int[two2h];
    cachePositionsLength = new long[two2h + 1];
    cachePointer = new OutputBitStream[two2h];
    cachePointerByte = new FastByteArrayOutputStream[two2h];
    
    for ( int i = 0; i < two2h; i++ ) 
      cachePointer[i] = new OutputBitStream( cachePointerByte[i] = new FastByteArrayOutputStream(), 0 );

    cacheSkip = new OutputBitStream[two2h];
    cacheSkipBitCount = new OutputBitStream[two2h];
    cacheSkipByte = new FastByteArrayOutputStream[two2h];

    for ( int i = 0; i < two2h; i++ ) {
      cacheSkip[ i ] = new OutputBitStream( cacheSkipByte[i] = new FastByteArrayOutputStream(), 0 );
      cacheSkipBitCount[ i ] = new OutputBitStream( NullOutputStream.getInstance(), 0 );
    }
  
    skipPointer = new int[ two2h + 1 ];
    distance = new long[ two2h + 1 ];

    bitCount = new OutputBitStream( NullOutputStream.getInstance(), 0 );

    // One entry per tower level (0..h).
    towerTopB = new int[ h + 1 ];
    towerTopLog2B = new int[ h + 1 ];
    towerLowerB = new int[ h + 1 ];
    towerLowerLog2B = new int[ h + 1 ];
    pointerPrediction = new int[ h + 1 ];

  }


  
  /** Writes a document pointer to the given bit stream using the current pointer coding.
   *
   * <p>Pointers are gap-encoded with respect to {@link #lastDocument}. If the current term
   * occurs in every document ({@link #frequency} equals {@link #numberOfDocuments}) nothing
   * is written, as pointers are then consecutive integers.
   *
   * @param out the output bit stream the pointer should be written to.
   * @param pointer the document pointer.
   * @return the number of bits written.
   */
  private int writeOutPointer( final OutputBitStream out, final int pointer ) throws IOException {
    if ( frequency == numberOfDocuments ) return 0; // We do not write pointers for everywhere occurring terms.

    switch ( pointerCoding ) {
      // SHIFTED_GAMMA was missing here, although writeDocumentPointer() supports it;
      // with that coding this method would have thrown IllegalStateException.
      case SHIFTED_GAMMA:
        return out.writeShiftedGamma( pointer - lastDocument - 1 );
      case GAMMA:
        return out.writeGamma( pointer - lastDocument - 1 );
      case DELTA:
        return out.writeDelta( pointer - lastDocument - 1 );
      case GOLOMB:
        return out.writeGolomb( pointer - lastDocument - 1, b, log2b );
      default:
        throw new IllegalStateException( "The required pointer coding (" + pointerCoding + ") is not supported." );
    }
  }


  /** A structure maintaining statistical data about tower construction. */

  public static class TowerData {
    /** The number of bits written for bit skips at the top of a tower. */
    public long bitsForTopBitSkips;

    /** The number of bits written for positions bit skips at the top of a tower. */
    public long bitsForTopPositionsBitSkips;

    /** The number of bits written for skip pointers at the top of a tower. */
    public long bitsForTopSkipPointers;

    /** The number of bits written for bit skips in the lower part of a tower. */
    public long bitsForLowerBitSkips;

    /** The number of bits written for positions bit skips in the lower part of a tower. */
    public long bitsForLowerPositionsBitSkips;

    /** The number of bits written for skip pointers in the lower part of a tower. */
    public long bitsForLowerSkipPointers;

    /** The number of bits written for tower lengths. */
    public long bitsForTowerLengths;

    /** The number of written skip towers. */
    public long numberOfSkipTowers;

    /** The number of written top skip entries. */
    public long numberOfTopEntries;

    /** The number of written lower skip entries. */
    public long numberOfLowerEntries;

    /** Clear all fields of this tower data. */

    void clear() {
      bitsForTopBitSkips = 0;
      bitsForTopPositionsBitSkips = 0;
      bitsForTopSkipPointers = 0;
      bitsForLowerBitSkips = 0;
      bitsForLowerPositionsBitSkips = 0;
      bitsForLowerSkipPointers = 0;
      bitsForTowerLengths = 0;
      numberOfSkipTowers = 0;
      numberOfTopEntries = 0;
      numberOfLowerEntries = 0;
    }


    /** Returns the overall number of bits used for skip pointers.
     * @return the overall number of bits used for skip pointers.
     */
    public long bitsForSkipPointers() { return bitsForTopSkipPointers + bitsForLowerSkipPointers; }

    /** Returns the overall number of bits used for bit skips. 
     * @return the overall number of bits used for bit skips.
     */
    public long bitsForBitSkips() { return bitsForTopBitSkips + bitsForLowerBitSkips; }

    /** Returns the overall number of bits used for positions bit skips.
     * @return the overall number of bits used for positions bit skips.
     */
    public long bitsForPositionsBitSkips() { return bitsForTopPositionsBitSkips + bitsForLowerPositionsBitSkips; }

    /** Returns the overall number of bits used for tower entries (bits for tower lengths are not included).
     * @return the overall number of bits used for tower entries.
     */
    public long bitsForEntries() { return bitsForSkipPointers() + bitsForBitSkips() + bitsForPositionsBitSkips(); }

    /** Returns the overall number of bits used for towers.
     * @return the overall number of bits used for towers.
     */
    public long bitsForTowers() { return bitsForTowerLengths + bitsForEntries(); }

    /** Returns the overall number of entries.
     * @return the overall number of entries.
     */
    public long numberOfEntries() { return numberOfTopEntries + numberOfLowerEntries; }
  }
  
  
  /** Starts a new inverted list: flushes any cached data of the previous list, records the
   * offset of the new list (if an offset stream was provided), and writes into the index
   * stream the current bit position of the positions stream.
   *
   * @return the position (in bits) of the index bit stream where the new inverted list starts.
   * @throws IllegalStateException if the writer is not between inverted lists, or the number
   * of document records written does not match the declared frequency.
   */
  public long newInvertedList() throws IOException {
    if ( cache != 0 ) writeOutCache( -1 );
    if ( frequency >= 0 && frequency != writtenDocuments ) throw new IllegalStateException( "The number of document records (" + this.writtenDocuments + ") does not match the frequency (" + this.frequency + ")" );
    if ( state != BEFORE_INVERTED_LIST && state != BEFORE_DOCUMENT_RECORD ) throw new IllegalStateException( "Trying to start new inverted list in state " + state );

    // The position (in bits) where the new inverted list starts
    long pos = obs.writtenBits();
    // Reset variables
    writtenDocuments = 0;
    currentTerm++;
    currentDocument = -1;

    // If needed, write the offset
    if ( offset != null ) offset.writeLongGamma( pos - lastInvertedListPos );
    // Write the offset for positions
    bitsForPositionsOffsets += obs.writeLongDelta( positions.writtenBits() );
    lastInvertedListPos = pos;
    state = BEFORE_FREQUENCY;
    return pos;
  }

  /** Writes the frequency of the current inverted list and precomputes the parameters that
   * depend on it: the Golomb modulus for pointers (if {@link Coding#GOLOMB} is used) and,
   * for each tower level, the Golomb moduli and pointer predictions of the skip structure.
   *
   * @param frequency the frequency of the current term (number of document records to follow); must be strictly positive.
   * @return the number of bits written for the frequency.
   * @throws IllegalStateException if the writer is not positioned before a frequency.
   */
  public int writeFrequency( final int frequency ) throws IOException {
    if ( state != BEFORE_FREQUENCY ) throw new IllegalStateException( "Trying to write frequency in state " + state );

    int bitCount;
    // Write the frequency
    switch( frequencyCoding ) {
    case SHIFTED_GAMMA:
      bitCount = obs.writeShiftedGamma( frequency - 1 ); // frequency cannot be 0
      break;
    case GAMMA:
      bitCount = obs.writeGamma( frequency - 1 ); // frequency cannot be 0
      break;
    case DELTA:
      bitCount = obs.writeDelta( frequency - 1 ); // frequency cannot be 0
      break;
    default:
      throw new IllegalStateException( "The required frequency coding (" + frequencyCoding + ") is not supported." );
    }

    this.frequency = frequency;

    // We compute the modulus used for pointer Golomb coding 
    if ( pointerCoding == Coding.GOLOMB ) {
      b = BitStreamIndex.golombModulus( frequency, numberOfDocuments ); 
      log2b = Fast.mostSignificantBit( b );
    }

    // No cache has been written yet for this inverted list.
    prevQuantumBitLength = prevEntryBitLength = prevPositionsQuantumBitLength = -1;  

    if ( DEBUG ) System.err.println( "----------- " + currentTerm + " (" + frequency + ")" );

    // Precompute, for each usable tower level, the Golomb moduli for top/lower pointer skips
    // and the expected pointer skip (rounded to nearest integer).
    final long pointerQuantumSigma = BitStreamIndex.quantumSigma( frequency, numberOfDocuments, q );
    for( int i = Math.min( h, Fast.mostSignificantBit( frequency / q ) ); i >= 0; i-- ) {
      towerTopB[ i ] = BitStreamIndex.gaussianGolombModulus( pointerQuantumSigma, i + 1 );
      towerTopLog2B[ i ] = Fast.mostSignificantBit( towerTopB[ i ] );
      towerLowerB[ i ] = BitStreamIndex.gaussianGolombModulus( pointerQuantumSigma, i );
      towerLowerLog2B[ i ] = Fast.mostSignificantBit( towerLowerB[ i ] );
      pointerPrediction[ i ] = (int)( ( q * ( 1L << i ) * numberOfDocuments + frequency / 2 ) / frequency );
    }
    
    state = BEFORE_DOCUMENT_RECORD;
    bitsForFrequencies += bitCount;
    return bitCount;
  }

  /** Starts a new document record.
   *
   * <p>Note that the returned stream is the cache data stream, not the index stream:
   * document records are accumulated in the cache and flushed in blocks.
   *
   * @return the {@link OutputBitStream} the record data should be written to.
   * @throws IllegalStateException if the declared frequency has already been reached,
   * or the writer is not positioned before a document record.
   */
  public OutputBitStream newDocumentRecord() throws IOException {
    if ( frequency == writtenDocuments ) throw new IllegalStateException( "Document record overflow (written " + this.frequency + " already)" );
    if ( state != BEFORE_DOCUMENT_RECORD ) throw new IllegalStateException( "Trying to start new document record in state " + state );

    writtenDocuments++;
    numberOfPostings++;
    lastDocument = currentDocument;
    state = BEFORE_POINTER;
    return cacheDataOut;
  }

  /** Writes a document pointer, routing it to the per-quantum pointer stream when the record
   * is the first of a quantum, and to the cache data stream otherwise. When the cache is full
   * ({@link #w} records) the current block is flushed first.
   *
   * @param unused ignored (the actual destination is chosen internally).
   * @param pointer the document pointer.
   * @return the number of bits written for the pointer.
   * @throws IllegalStateException if the writer is not positioned before a pointer, or the
   * pointers of an everywhere-occurring term are not consecutive.
   */
  public int writeDocumentPointer( @SuppressWarnings("unused") final OutputBitStream unused, final int pointer ) throws IOException {
    if ( state != BEFORE_POINTER ) throw new IllegalStateException( "Trying to write pointer in state " + state );

    // If the previous block is over, write it out!

    if ( cache == w ) writeOutCache( pointer );

    final OutputBitStream out;
    
    // Record data pointer if we are on a skip; otherwise, write it to the cache.
    if ( cache % q == 0 ) {
      if ( cache / q > 0 ) {
        cacheDataLength[ cache / q - 1 ] = (int)cacheDataOut.writtenBits();
        /* Record how many bits the positions stream grew during the last quantum.
         * Fix: the previous implementation cast this long difference to int before
         * storing it into the long[] cachePositionsLength, silently truncating
         * values larger than Integer.MAX_VALUE when assertions were disabled. */
        cachePositionsLength[ cache / q - 1 ] = positions.writtenBits() - writtenPositionsBitsAtLastQuantum;
        writtenPositionsBitsAtLastQuantum = positions.writtenBits();
      }
      cacheDataOut.align();
      cacheDataOut.writtenBits( 0 );
      skipPointer[ cache / q ] = pointer;
      out = cachePointer[ cache++ / q ];
    } 
    else {
      cache++;
      out = cacheDataOut;
    }

    currentDocument = pointer;
    int bitCount = 0;

    if ( frequency != numberOfDocuments ) { // We do not write pointers for everywhere occurring documents.
      switch( pointerCoding ) {
        case SHIFTED_GAMMA:
          bitCount = out.writeShiftedGamma( pointer - lastDocument - 1 );
          break;
        case GAMMA:
          bitCount = out.writeGamma( pointer - lastDocument - 1 );
          break;
        case DELTA:
          bitCount = out.writeDelta( pointer - lastDocument - 1 );
          break;
        case GOLOMB:
          bitCount = out.writeGolomb( pointer - lastDocument - 1, b, log2b );
          break;
        default:
          throw new IllegalStateException( "The required pointer coding (" + pointerCoding + ") is not supported." );
      }
    }
    else if ( pointer - lastDocument != 1 ) throw new IllegalStateException( "Term " + currentTerm + " has frequency equal to the number of documents, but pointers are not consecutive integers" );

    state = hasPayloads ? BEFORE_PAYLOAD : hasCounts ? BEFORE_COUNT : BEFORE_DOCUMENT_RECORD;
    bitsForPointers += bitCount;
    return bitCount;
  }

  /** Always throws: high-performance indices cannot carry payloads
   * (use a {@link BitStreamIndexWriter} if payloads are needed).
   *
   * @param out ignored.
   * @param payload ignored.
   * @return never returns normally.
   * @throws IllegalStateException always.
   */
  public int writePayload( final OutputBitStream out, final Payload payload ) throws IOException {
    throw new IllegalStateException( "High-performance indices do not support payloads" );
  }
  
  /** Writes the position count of the current document record using the configured count coding.
   *
   * @param out the output bit stream the count should be written to.
   * @param count the position count; must be strictly positive (it is encoded as <code>count - 1</code>).
   * @return the number of bits written.
   * @throws IllegalStateException if no inverted list has been started, or the writer is not
   * positioned before a count.
   */
  public int writePositionCount( final OutputBitStream out, final int count ) throws IOException {
    if ( frequency < 0 ) throw new IllegalStateException( "Trying to write count without calling newInvertedList" );
    if ( state != BEFORE_COUNT ) throw new IllegalStateException( "Trying to write count in state " + state );
    final int bitCount;

    numberOfOccurrences += count;
    switch( countCoding ) {
      case SHIFTED_GAMMA:
        bitCount = out.writeShiftedGamma( count - 1 );
        break;
      case GAMMA:
        bitCount = out.writeGamma( count - 1 );
        break;
      case UNARY:
        bitCount = out.writeUnary( count - 1 );
        break;
      case DELTA:
        bitCount = out.writeDelta( count - 1 );
        break;
      default:
        throw new IllegalStateException( "The required count coding (" + countCoding + ") is not supported." );
    }
    
    state = hasPositions ? BEFORE_POSITIONS : BEFORE_DOCUMENT_RECORD;
    bitsForCounts += bitCount;
    return bitCount;
  }

  /** Writes the positions of the current document record to the (separate) positions stream,
   * gap-encoded with the configured position coding. Only nonparametric codings are supported
   * (no Golomb/skewed-Golomb/interpolative; see the class comment).
   *
   * @param unused ignored; positions always go to the positions stream, not to the index stream.
   * @param occ the array containing the positions.
   * @param offset the first valid index in <code>occ</code>.
   * @param len the number of valid positions.
   * @param docSize the size of the current document, used only for assertion checking
   * (ignored if nonpositive).
   * @return the number of bits written.
   * @throws IllegalStateException if no inverted list has been started, or the writer is not
   * positioned before positions.
   */
  public int writeDocumentPositions( @SuppressWarnings("unused") final OutputBitStream unused, final int[] occ, final int offset, final int len, final int docSize ) throws IOException {
    if ( frequency < 0 ) throw new IllegalStateException( "Trying to write occurrences without calling newInvertedList" );
    if ( state != BEFORE_POSITIONS ) throw new IllegalStateException( "Trying to write positions in state " + state );

    if ( ASSERTS && docSize > 0 ) for( int i = 0; i< len; i++ ) assert occ[ offset + i ] < docSize : "Position " + occ[ offset + i ] + " for document " + currentDocument + " is too large; size is " + docSize;
    
    int i;
    int prev = -1;
    int bitCount = 0;
    final int end = offset + len;
    final OutputBitStream positions = this.positions;
    
    switch( positionCoding ) {
      case GAMMA:
        if ( COOKIES ) bitCount += positions.writeGamma( Integer.MAX_VALUE );
        for( i = offset; i < end; i++ ) {
          bitCount += positions.writeGamma( occ[ i ] - prev - 1 );
          prev = occ[ i ];
        }
        break;
      case DELTA:
        if ( COOKIES )  bitCount += positions.writeDelta( Integer.MAX_VALUE );
        for( i = offset; i < end; i++ ) {
          bitCount += positions.writeDelta( occ[ i ] - prev - 1 );
          prev = occ[ i ];
        }
        break;
      case SHIFTED_GAMMA:
        if ( COOKIES ) bitCount += positions.writeShiftedGamma( Integer.MAX_VALUE );
        for( i = offset; i < end; i++ ) {
          bitCount += positions.writeShiftedGamma( occ[ i ] - prev - 1 );
          prev = occ[ i ];
        }
        break;
      default:
        throw new IllegalStateException( "The required position coding (" + positionCoding + ") is not supported." );
    }

    state = BEFORE_DOCUMENT_RECORD;
    bitsForPositions += bitCount;
    if ( len > maxCount ) maxCount = len;
    return bitCount;  
  }

  
  /** Flushes any cached data, writes the final offset (if requested) and closes all
   * underlying streams, deleting the temporary file.
   *
   * <p>Fix: closes are now chained in <code>try</code>/<code>finally</code> blocks, so that
   * a failure while closing one stream no longer leaks the remaining streams or the
   * temporary file (previously a throwing <code>close()</code> aborted all subsequent
   * cleanup). The first exception raised is propagated.
   *
   * @throws IllegalStateException if an inverted list is incomplete or the writer is in an
   * inconsistent state, or the written-bits bookkeeping disagrees with the streams.
   */
  public void close() throws IOException {
    if ( cache != 0 ) writeOutCache( -1 );
    
    if ( state != BEFORE_DOCUMENT_RECORD && state != BEFORE_INVERTED_LIST ) throw new IllegalStateException( "Trying to close index in state " + state );
    if ( frequency >= 0 && frequency != writtenDocuments ) throw new IllegalStateException( "The number of document records (" + this.writtenDocuments + ") does not match the frequency (" + this.frequency + ")" );

    if ( writtenBits() != obs.writtenBits() + positions.writtenBits() ) 
      throw new IllegalStateException( "Written bits count mismatch: we say " + writtenBits() + ", the streams say " + ( obs.writtenBits() + positions.writtenBits() ) );

    try {
      if ( offset != null ) {
        offset.writeLongGamma( obs.writtenBits() - lastInvertedListPos );
        offset.close();
      }
    }
    finally {
      try {
        obs.close();
      }
      finally {
        try {
          positions.close();
        }
        finally {
          try {
            cacheDataIn.close();
          }
          finally {
            try {
              cacheDataOut.close();
            }
            finally {
              tempFile.delete(); // Best effort: the temp file lives in the default temp directory anyway.
            }
          }
        }
      }
    }
  }
  

  /** Computes the towers.
   * 
   * @param quantumBitLength the length in bits of a quantum.
   * @param positionsQuantumBitLength the length in bits of a quantum in the positions stream.
   * @param entryBitLength the estimated length in bits of a tower entry.
   * @param toTheEnd the number of bits that must be skipped to reach the next tower (usually,
   * the length of the first pointer of the next block or 0 if this is to be the last block).
   * @param skip an array of output bit stream where the data related to each tower will be written.
   * @param towerData will be filled with statistical data about the towers.
   * @param doinIt if true, we are actually writing a tower, not just trying.
   */
  /** Generates (or simulates the generation of) the skip towers for the block currently in the cache.
   *
   * <p>This method scans the quanta of the cached block <em>backwards</em>, maintaining in
   * {@code toTheEnd} the number of bits between the current point and the end of the block
   * (plus whatever the caller passed in initially), and emits for each non-defective quantum
   * {@code k} a skip tower into {@code skip[ k ]}. Tower entries are written as corrections
   * w.r.t. predictions based on {@code quantumBitLength}, {@code positionsQuantumBitLength}
   * and {@code entryBitLength}, so the choice of those parameters influences the final size;
   * the caller ({@code writeOutCache()}) invokes this method repeatedly on counting-only
   * streams to optimise {@code entryBitLength} before the final, real pass.
   *
   * @param quantumBitLength the (average) bit length of a quantum, used to predict bit skips.
   * @param positionsQuantumBitLength the (average) bit length of the positions of a quantum.
   * @param entryBitLength the assumed average bit length of a skip-tower entry, used to predict tower sizes.
   * @param toTheEnd the number of bits following the last quantum of the block (e.g. the
   * encoding of the next block's first pointer), used as the base of the backward accumulation.
   * @param skip the per-quantum output bit streams receiving the towers; the caller passes a
   * counting-only array when simulating, and the real cached streams for the final pass.
   * @param towerData the statistics accumulator receiving entry counts and per-component bit costs.
   * @param doinIt whether this is the final (real) pass; inside this method it only gates
   * debugging output — the dry-run/real distinction is embodied by the {@code skip} argument.
   */
  private void tryTower( final int quantumBitLength, final int positionsQuantumBitLength, final int entryBitLength, long toTheEnd, final OutputBitStream[] skip, final TowerData towerData, final boolean doinIt ) throws IOException {
    int i, k, s;
    long d;
    int basePointer;
    // truncated is true only for those towers (in defective blocks) whose height is strictly smaller than the height they should have
  
    boolean truncated = false;

    if ( DEBUG && doinIt ) System.err.println( "Writing out tower for term " + currentTerm + "; quantumBitLength=" + quantumBitLength + " entryBitLength=" + entryBitLength );
    
    // Backward scan over quanta: k runs from the last quantum of the block down to the first.
    for ( k = ( cache - 1 ) / q; k >= 0; k-- ) {
      // Where are we? At the end of the k-th quantum. So toTheEnd must be increased by
      // the length of the data contained in the same quantum, moving us...
      toTheEnd += cacheDataLength[ k ];

      // ...just after the k-th skip tower.
      // We compute the maximum valid index of the skip tower (*MUST* be kept in sync with the subsequent loop).
      s = ( k == 0 ) ? h : Fast.leastSignificantBit( k );

      // This test handles defective blocks. In particular, for defective quanta s=-1,
      // yielding no skipping data at all for such quanta. truncated is true if the
      // current tower is truncated w.r.t. the infinite skip list.
      if ( cache < w ) {
        final int upperBound = Fast.mostSignificantBit( ( cache / q ) - k );
        if ( s > upperBound ) {
          s = upperBound;
          truncated = true;
        } else truncated = false;
      }
      else truncated = k == 0;
      
      // Reset the bit count of the k-th skip stream, so that writtenBits() below
      // measures exactly the bits of this tower.
      skip[ k ].writtenBits( 0 );

      if ( s >= 0 ) {
        if ( DEBUG && doinIt ) System.err.print( "% (" + k + ") [" + skipPointer[ k ] + "] " );

        basePointer = skipPointer[ k ];

        /* If the current tower is truncated, we must actually write the top of the tower.
         * The top must be forecast in a Bernoullian way: we write it as a difference from the average pointer skip, 
         * which is q 2^s / relativeFrequency. */
        if ( truncated ) {
          towerData.numberOfTopEntries++;
          // TODO: prediction should be based not on 1<<s, but rather on the actual number of skipped quanta, which could be smaller (because of end-of-list)
          towerData.bitsForTopSkipPointers += skip[k].writeGolomb( Fast.int2nat( skipPointer[ k + ( 1 << s ) ] - basePointer - pointerPrediction[ s ] ), towerTopB[ s ], towerTopLog2B[ s ] );
          // Top bit skip: actual distance minus the prediction (2^s quanta plus the entries
          // of the intermediate towers, each assumed entryBitLength bits long).
          towerData.bitsForTopBitSkips += skip[k].writeLongDelta( Fast.int2nat( 
            (int) ( ( toTheEnd - distance[k + ( 1 << s )] ) -
                ( quantumBitLength * ( 1 << s ) + entryBitLength * ( ( 1 << s + 1 ) - s - 2 ) ) ) )
          );
          // Top positions bit skip: actual positions span minus 2^s average positions quanta.
          towerData.bitsForTopPositionsBitSkips += skip[k].writeLongDelta( 
              Fast.int2nat( ( cachePositionsLength[ k ] - cachePositionsLength[ k + ( 1 << s ) ] ) - positionsQuantumBitLength * ( 1 << s ) ) );
        }
        
        if ( DEBUG && doinIt ) System.err.print( ( truncated ? "" : "(" ) + ( skipPointer[ k + ( 1 << s ) ] - basePointer ) + ":" + ( toTheEnd - distance[k + ( 1 << s )] ) + ( truncated ? " " : ") " ) );

        // Produce a (single) tower of height s
        // Each lower entry (level i) is written as the difference from half the entry one
        // level up, which acts as its prediction.
        for ( i = s - 1; i >= 0; i-- ) {
          towerData.bitsForLowerSkipPointers += skip[k].writeGolomb( 
            Fast.int2nat( ( skipPointer[k + ( 1 << i )] - basePointer ) - ( ( skipPointer[k + ( 1 << i + 1 )] - basePointer ) / 2 ) ),
            towerLowerB[ i ], towerLowerLog2B[ i ] 
          );

          towerData.bitsForLowerBitSkips += skip[k].writeLongDelta( 
            Fast.int2nat( (int) ( ( ( toTheEnd - distance[k + ( 1 << ( i + 1 ) )] - entryBitLength * ( i + 1 ) ) / 2 ) -
                ( toTheEnd - distance[k + ( 1 << i )] ) ) ) 
          );
          towerData.bitsForLowerPositionsBitSkips += skip[k].writeLongDelta( 
              Fast.int2nat( ( cachePositionsLength[ k ] - cachePositionsLength[k + ( 1 << i + 1 )] ) / 2 -
                  ( cachePositionsLength[ k ] - cachePositionsLength[k + ( 1 << i )] ) ) 
            );

          if ( DEBUG && doinIt ) System.err.print( ( skipPointer[k + ( 1 << i )] - basePointer ) + ":" + ( toTheEnd - distance[k + ( 1 << i )] ) + " " );
        }

        if ( s > 0 ) { // No length for single-entry towers.
          // The tower length is itself written as a correction w.r.t. the predicted
          // (s+1)-entry tower size; bitCount just measures how many bits that takes.
          d = bitCount.writeDelta( Fast.int2nat( (int) skip[k].writtenBits() - ( s + 1 ) * entryBitLength ) );
          towerData.bitsForTowerLengths += d;
          toTheEnd += d;
        }

        // Account for the tower's own bits: towers above this one skip over it too.
        toTheEnd += skip[k].writtenBits();

        if ( DEBUG && doinIt ) System.err.print( " (" + (int) skip[k].writtenBits() + " bits)" );

        towerData.numberOfLowerEntries += s;
        towerData.numberOfSkipTowers++;

        if ( DEBUG && doinIt ) System.err.println();
      }

      // distance[ k ] is the number of bits from just before tower k to the end of the
      // block; higher towers read it above to compute their bit skips.
      distance[ k ] = toTheEnd;

      // Where are we? Just before the beginning of the k-th skip tower
      toTheEnd += cachePointer[ k ].writtenBits();

      // Where are we? Just before the beginning of the k-th document record
    }
  }

  /** Writes out the cache content as one block of the inverted list.
   * 
   * <p>This method finalises the per-quantum bookkeeping (data and positions lengths),
   * searches for the entry bit length minimising the overall tower cost by repeatedly
   * calling {@link #tryTower(int, int, int, long, OutputBitStream[], TowerData, boolean)}
   * on counting-only streams, writes the towers for real, and finally pours pointers,
   * towers and cached data into the main output bit stream {@code obs}, resetting all
   * caches afterwards.
   * 
   * @param nextPointer the first pointer of the next block, or -1 if this is the last block.
   */
  private void writeOutCache( final int nextPointer ) throws IOException {
    if ( DEBUG ) System.err.println( "Entered writeOutCache() with cache=" + cache + " (H is " + ( 1 << h ) + ", B is " + w + ")" );

    // Record the data length of the last (possibly partial) quantum of the block.
    cacheDataLength[ ( cache + q - 1 ) / q - 1 ] = (int)cacheDataOut.writtenBits();
    if ( ASSERTS ) assert positions.writtenBits() - writtenPositionsBitsAtLastQuantum <= Integer.MAX_VALUE : ( positions.writtenBits() - writtenPositionsBitsAtLastQuantum ) + " > " + Integer.MAX_VALUE;
    cachePositionsLength[ ( cache + q - 1 ) / q - 1 ] = (int)( positions.writtenBits() - writtenPositionsBitsAtLastQuantum );
    cachePositionsLength[ ( cache + q - 1 ) / q ] = 0;
    writtenPositionsBitsAtLastQuantum = positions.writtenBits();
    
    /* We cumulate the position lengths so to obtain the actual skips. */
    // After this loop cachePositionsLength[ i ] is the total positions length from quantum i to the end of the block.
    for( int i = ( cache + q - 1 ) / q; i-- != 0; ) cachePositionsLength[ i ] += cachePositionsLength[ i + 1 ];
    
    //System.err.print( basename ); for( int i = 0; i < ( cache + q - 1 ) / q; i++ ) System.err.print( " " + cachePositionsLength[ i ] ); System.err.println();
    
    /* Number of bits to go after the first pointer of the first record of the next block (or, if there
       is no other block in the current list, to go to the end of the list). */
    long toTheEnd;

    // Record the new document pointer for the highest tower
    int nextAfter = ( ( cache + q ) - 1 ) / q; // This is ceil( cache / q )

    if ( nextPointer >= 0 ) {
      skipPointer[nextAfter] = nextPointer;
      // bitCount only measures the encoded size of the pointer; nothing is emitted yet.
      toTheEnd = writeOutPointer( bitCount, nextPointer );
    } else {
      skipPointer[nextAfter] = currentDocument + 1; // Fake: just for the last block
      toTheEnd = 0;
    }

    distance[nextAfter] = 0;

    int k, s;
    long d;

    // Compute quantum length in bits (without towers)
    // Both averages are computed as ceilings over the number of cached postings.
    int quantumBitLength = 0, entryBitLength = 0, positionsQuantumBitLength = (int)( ( cachePositionsLength[ 0 ] * q + ( cache -1 ) ) / cache );

    for ( d = k = 0; k <= ( ( cache - 1 ) / q ); k++ ) d += ( cachePointer[k].writtenBits() + cacheDataLength[ k ] );
    quantumBitLength = (int)( ( ( d * q ) + ( cache - 1 ) ) / cache );

    final TowerData td = new TowerData();
    final Int2IntRBTreeMap candidates = new Int2IntRBTreeMap(); 

    /* As a first try, we compute the tower costs using 0 as average entry bit length. */
    tryTower( quantumBitLength, positionsQuantumBitLength, 0, toTheEnd, cacheSkipBitCount, td, false );
    
    if ( td.numberOfSkipTowers > 0 ) { // There actually is at least a tower.
      /* Now we repeat this operation, trying to obtain the best value for the
       * average entry bit length. 
       */

      // Fixed-point iteration: feed the resulting average entry length back in until we
      // see a repeated candidate (or give up after MAX_TRY attempts); candidates maps
      // resulting tower cost to the entry bit length that produced it.
      while( candidates.size() < MAX_TRY && ! candidates.containsValue( entryBitLength = (int)( td.bitsForTowers() / td.numberOfEntries() ) ) ) {
        td.clear();
        tryTower( quantumBitLength, positionsQuantumBitLength, entryBitLength, toTheEnd, cacheSkipBitCount, td, false );
        candidates.put( (int)( td.bitsForTowers() / td.numberOfEntries() ), entryBitLength );
      }

      if ( ASSERTS ) assert candidates.size() < MAX_TRY;

      // Pick the entry bit length with the smallest overall tower cost (tree map is sorted by cost).
      entryBitLength = candidates.get( candidates.firstIntKey() );

      if ( DEBUG ) System.err.println( "Going to write tower at position " + obs.writtenBits() );
      // Real pass: write the towers into cacheSkip and accumulate global statistics in towerData.
      tryTower( quantumBitLength, positionsQuantumBitLength, entryBitLength, toTheEnd, cacheSkip, towerData, true );
    }

    // Ready to write out cache
    int maxCacheDataLength = 0;
    for ( k = 0; k <= ( ( cache - 1 ) / q ); k++ ) if ( cacheDataLength[ k ] > maxCacheDataLength ) maxCacheDataLength = cacheDataLength[ k ];  
    
    /* We have two ways of writing out cached data. If all the data is still in the output bit
     * stream buffer, we just read it directly. Otherwise, we have to pour it into a temporary buffer. */
    
    final byte[] buffer;
    final boolean direct;
    int pos = 0;
    
    cacheDataOut.align();

    if ( cacheDataOut.buffer() != null ) {
      buffer = cacheDataOut.buffer();
      direct = true;
    }
    else {
      cacheDataOut.flush();
      buffer = new byte[ ( maxCacheDataLength + 7 ) / 8 ];
      direct = false;
      cacheDataIn.flush();
      cacheDataIn.position( 0 );
    }
    
    for ( k = 0; k <= ( ( cache - 1 ) / q ); k++ ) {

      /* See comments above. */
      s = ( k == 0 ) ? h : Fast.leastSignificantBit( k );

      if ( cache < w ) s = Math.min( s, Fast.mostSignificantBit( ( cache / q ) - k ) );

      // Pour the cached pointer bits for quantum k into the main stream.
      d = cachePointer[k].writtenBits();
      cachePointer[k].flush();
      obs.write( cachePointerByte[k].array, d );

      d = cacheSkip[k].writtenBits();
      cacheSkip[k].flush();

      if ( s >= 0 ) {
        if ( k == 0 ) {
          // Block header: quantum/positions-quantum/entry bit lengths, delta-encoded
          // against the previous block's values (absolute for the very first block).
          if ( prevQuantumBitLength < 0 ) {
            bitsForQuantumBitLengths += obs.writeLongDelta( quantumBitLength );
            bitsForPositionsQuantumBitLengths += obs.writeLongDelta( positionsQuantumBitLength );
            bitsForEntryBitLengths += obs.writeLongDelta( entryBitLength );
          }
          else {
            bitsForQuantumBitLengths += obs.writeLongDelta( Fast.int2nat( quantumBitLength - prevQuantumBitLength ) );
            bitsForPositionsQuantumBitLengths += obs.writeLongDelta( Fast.int2nat( positionsQuantumBitLength - prevPositionsQuantumBitLength ) );
            bitsForEntryBitLengths += obs.writeLongDelta( Fast.int2nat( entryBitLength - prevEntryBitLength ) );
          }

          prevQuantumBitLength = quantumBitLength;
          prevPositionsQuantumBitLength = positionsQuantumBitLength;
          prevEntryBitLength = entryBitLength;

          numberOfBlocks++;
        }

        if ( s > 0 ) obs.writeDelta( Fast.int2nat( (int)d - entryBitLength * ( s + 1 ) ) ); // No length for single-entry towers.
      } else if ( ASSERTS ) assert d == 0;

      obs.write( cacheSkipByte[k].array, d );
      
      if ( direct ) {
        // Data still sits in cacheDataOut's buffer: copy quantum k straight from it.
        obs.write( buffer, pos * 8, cacheDataLength[ k ] );
        pos += ( cacheDataLength[ k ] + 7 ) / 8;
      }
      else {
        // NOTE(review): the return value of read() is ignored — presumably the backing
        // stream always returns the full request here; verify short reads cannot happen.
        cacheDataIn.read( buffer, 0, ( cacheDataLength[ k ] + 7 ) / 8 );
        obs.write( buffer, cacheDataLength[ k ] );
      }
    }

    // Clean used caches
    for ( k = 0; k <= ( ( cache - 1 ) / q ); k++ ) {
      cachePointerByte[k].reset();
      cachePointer[k].writtenBits( 0 );

      cacheSkipByte[k].reset();
      cacheSkip[k].writtenBits( 0 );

      // NOTE(review): these two resets act on the single cacheDataOut stream, so
      // repeating them per quantum looks redundant (harmless, but loop-invariant).
      cacheDataOut.position( 0 );
      cacheDataOut.writtenBits( 0 );
    }

    cache = 0;

    if ( ASSERTS ) assert obs.writtenBits() + positions.writtenBits() == writtenBits();
  }


  /** Returns the overall number of bits written so far, computed as the sum of all
   * per-component bit counters (frequencies, pointers, payloads, counts, positions and
   * their offsets, skip towers, and the per-block quantum/entry bit-length headers).
   *
   * @return the total number of bits written by this index writer.
   */
  public long writtenBits() {
    long total = bitsForFrequencies + bitsForPointers + bitsForPayloads;
    total += bitsForCounts + bitsForPositions + bitsForPositionsOffsets;
    total += towerData.bitsForTowers();
    total += bitsForQuantumBitLengths + bitsForPositionsQuantumBitLengths + bitsForEntryBitLengths;
    return total;
  }

  /** Returns a set of properties describing this index: number of documents, terms and
   * postings, maximum count, index class, skip quantum and height, and the coding flags.
   *
   * <p>The {@code PAYLOADS} component is deliberately omitted from the saved codings,
   * as it is used internally only.
   *
   * @return the properties of this index.
   */
  public Properties properties() {
    final Properties properties = new Properties();
    properties.setProperty( Index.PropertyKeys.DOCUMENTS, numberOfDocuments );
    properties.setProperty( Index.PropertyKeys.TERMS, currentTerm + 1 );
    properties.setProperty( Index.PropertyKeys.POSTINGS, numberOfPostings );
    properties.setProperty( Index.PropertyKeys.MAXCOUNT, maxCount );
    properties.setProperty( Index.PropertyKeys.INDEXCLASS, FileHPIndex.class.getName() );
    properties.setProperty( BitStreamIndex.PropertyKeys.SKIPQUANTUM, q );
    properties.setProperty( BitStreamIndex.PropertyKeys.SKIPHEIGHT, h );
    if ( COOKIES ) properties.setProperty( "cookies", true );
    // Save every coding flag except the internal-only PAYLOADS component.
    for( Map.Entry<Component,Coding> entry : flags.entrySet() ) {
      if ( entry.getKey() == Component.PAYLOADS ) continue;
      properties.addProperty( Index.PropertyKeys.CODING, new MutableString().append( entry.getKey() ).append( ':' ).append( entry.getValue() ) );
    }
    return properties;
  }

  /** Prints statistics about this index to the given stream.
   *
   * <p>Besides the statistics printed by the superclass, this method reports the cost of
   * the skip structure: towers, entries, tower lengths, per-block quantum/entry bit
   * lengths, and the breakdown of top and lower tower entries into pointer skips, bit
   * skips and positions bit skips.
   *
   * @param stats the stream on which statistics are printed.
   */
  public void printStats( final PrintStream stats ) {
    super.printStats( stats );
    final TowerData td = towerData;
    // Denominators hoisted as doubles, so every per-item ratio below is a floating-point division.
    final double numTowers = td.numberOfSkipTowers;
    final double numTopEntries = td.numberOfTopEntries;
    final double numLowerEntries = td.numberOfLowerEntries;
    final double numEntries = td.numberOfEntries();
    final double numBlocks = numberOfBlocks;
    stats.println( "Skip towers: " + Util.format( td.numberOfSkipTowers ) + " (" + Util.format( td.bitsForTowers() ) + " bits [" + Util.format( td.bitsForTowers() * 100.0 / writtenBits() ) + "%], " + Util.format( td.bitsForTowers() / numTowers ) + " bits/tower)" );
    stats.println( "Skip entries: " + Util.format( td.numberOfEntries() ) + " (" + Util.format( td.bitsForEntries() / numEntries ) + " bits/entry)" );
    // Note that lengths are written approximately every other tower.
    stats.println( "Skip tower lengths: " + Util.format( td.bitsForTowerLengths ) + " bits (" + Util.format( 2.0 * td.bitsForTowerLengths / numTowers ) + " bits/tower)" );
    stats.println( "Quantum bit lengths: " + Util.format( bitsForQuantumBitLengths ) + " bits (" + Util.format( bitsForQuantumBitLengths / numBlocks ) + " bits/block)" );
    stats.println( "Positions quantum bit lengths: " + Util.format( bitsForPositionsQuantumBitLengths ) + " bits (" + Util.format( bitsForPositionsQuantumBitLengths / numBlocks ) + " bits/block)" );
    stats.println( "Entry bit lengths: " + Util.format( bitsForEntryBitLengths ) + " bits (" + Util.format( bitsForEntryBitLengths / numBlocks ) + " bits/block)" );
    
    stats.println( "Top bit skips: " + Util.format( td.bitsForTopBitSkips ) + " bits (" + Util.format( td.bitsForTopBitSkips / numTopEntries ) + " bits/skip)" );
    stats.println( "Top positions bit skips: " + Util.format( td.bitsForTopPositionsBitSkips ) + " bits (" + Util.format( td.bitsForTopPositionsBitSkips / numTopEntries ) + " bits/skip)" );
    stats.println( "Top pointer skips: " + Util.format( td.bitsForTopSkipPointers ) + " bits (" + Util.format( td.bitsForTopSkipPointers / numTopEntries ) + " bits/skip)" );
    stats.println( "Lower bit skips: " + Util.format( td.bitsForLowerBitSkips ) + " bits (" + Util.format( td.bitsForLowerBitSkips / numLowerEntries ) + " bits/skip)" );
    stats.println( "Lower positions bit skips: " + Util.format( td.bitsForLowerPositionsBitSkips ) + " bits (" + Util.format( td.bitsForLowerPositionsBitSkips / numLowerEntries ) + " bits/skip)" );
    stats.println( "Lower pointer skips: " + Util.format( td.bitsForLowerSkipPointers ) + " bits (" + Util.format( td.bitsForLowerSkipPointers / numLowerEntries ) + " bits/skip)" );
    stats.println( "Bit skips: " + Util.format( td.bitsForBitSkips() ) + " bits (" + Util.format( td.bitsForBitSkips() / numEntries ) + " bits/skip)" );
    stats.println( "Positions bit skips: " + Util.format( td.bitsForPositionsBitSkips() ) + " bits (" + Util.format( td.bitsForPositionsBitSkips() / numEntries ) + " bits/skip)" );
    stats.println( "Pointer skips: " + Util.format( td.bitsForSkipPointers() ) + " bits (" + Util.format( td.bitsForSkipPointers() / numEntries ) + " bits/skip)" );
  }
}
java2s.com  | Contact Us | Privacy Policy
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.