WARCParser.java :  » Search » galagosearch » org » galagosearch » core » parse » Java Open Source

Java Open Source » Search » galagosearch 
galagosearch » org » galagosearch » core » parse » WARCParser.java
// BSD License (http://www.galagosearch.org/license)
/*
 * WARC record parser
 * 
 * Originally written by:
 *   mhoy@cs.cmu.edu (Mark J. Hoy)
 * 
 * Modified for Galagosearch by:
 *   sjh
 */ 

package org.galagosearch.core.parse;

import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;

/**
 * Reads documents one at a time from a WARC stream.
 *
 * <p>The constructor reads the first record of the stream and stores it as the
 * file header. Each call to {@link #nextDocument()} then reads one further
 * record and converts it into a {@link Document}.
 *
 * <p>NOTE(review): nothing here synchronizes access to the underlying stream,
 * so this looks intended for use by a single thread -- confirm with callers.
 */
public class WARCParser implements DocumentStreamParser {
  /** Underlying stream; {@code null} once {@link #close()} has been called. */
  private DataInputStream reader = null;
  /** First record read from the stream (read in the constructor). */
  private WARCRecord fileHeader = null;
  /** Number of records read through {@link #nextDocument()}. */
  private long recordCount = 0;
  /** Sum of the total record lengths of the records read through {@link #nextDocument()}. */
  private long totalNumBytesRead = 0;

  /**
   * Wraps the given stream and reads its first record as the file header.
   *
   * @param stream the WARC input; it is closed by {@link #close()}
   * @throws IOException if reading the first record fails
   */
  public WARCParser( BufferedInputStream stream ) throws IOException {
    reader = new DataInputStream( stream );
    fileHeader = WARCRecord.readNextWarcRecord( reader );
  }

  /**
   * Closes the underlying stream. Calling this more than once is harmless;
   * later calls do nothing.
   *
   * @throws IOException if closing the underlying stream fails
   */
  public void close() throws IOException {
    if (reader == null) {
      // Already closed: a repeated close must be a no-op, not a NullPointerException.
      return;
    }
    try {
      reader.close();
    } finally {
      // Drop the reference even when close() throws, so the parser is never
      // left holding a half-closed stream.
      reader = null;
    }
  }

  /**
   * Reads the next record and converts it into a document.
   *
   * <p>The returned document's metadata is the record's own header metadata
   * map (shared, not copied), with an extra {@code "url"} entry taken from the
   * {@code WARC-Target-URI} header item.
   *
   * @return the next document, or {@code null} when the record reader yields
   *         no further record (presumably end of stream) or when this parser
   *         has already been closed
   * @throws IOException if reading the record fails
   */
  public Document nextDocument() throws IOException {
    if (reader == null) {
      // Closed parser: report "no more documents" instead of handing a null
      // stream to the record reader.
      return null;
    }

    WARCRecord record = WARCRecord.readNextWarcRecord( reader );
    if (record == null) {
      return null;
    }

    recordCount++;
    totalNumBytesRead += record.getTotalRecordLength();

    Document doc = new Document( record.getDocid(), record.getContent() );
    doc.metadata = record.warcHeader.metadata;
    doc.metadata.put("url", record.getHeaderMetadataItem("WARC-Target-URI") );

    return doc;
  }
}
java2s.com  | Contact Us | Privacy Policy
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.