org.terrier.indexing.WARC018Collection.java Source code

Java tutorial

Introduction

Here is the source code for org.terrier.indexing.WARC018Collection.java

Source

/*
 * Terrier - Terabyte Retriever 
 * Webpage: http://terrier.org/
 * Contact: terrier{a.}dcs.gla.ac.uk
 * University of Glasgow - School of Computing Science
 * http://www.gla.ac.uk/
 * 
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is WARC018Collection.java
 *
 * The Original Code is Copyright (C) 2004-2014 the University of Glasgow.
 * All Rights Reserved.
 *
 * Contributor(s):
 *   Craig Macdonald <craigm{a.}dcs.gla.ac.uk> (original contributor)
 */
package org.terrier.indexing;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.CharEncoding;
import org.apache.log4j.Logger;
import org.terrier.indexing.tokenisation.Tokeniser;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.Files;
import org.terrier.utility.FixedSizeInputStream;
import org.terrier.utility.StringTools;

/**
 * This object is used to parse WARC format web crawls, 0.18. 
 * The precise {@link Document} class to be used can be specified with the
 * <tt>trec.document.class</tt> property.
 * 
 * <p>
 * <b>Properties</b>
 * <ul>
 * <li><tt>trec.document.class</tt> the {@link Document} class to parse individual documents (defaults to {@link TaggedDocument}).</li>
 * <li><tt>warc018collection.force.utf8</tt> - should UTF8 encoding be assumed throughout. Defaults to false.</li>
 * <li><tt>warc018collection.header.docno</tt> - what header has the thing to be used as docno? Defaults to warc-trec-id.</li>
 * <li><tt>warc018collection.header.url</tt> - what header has the thing to be used as url? Defaults to warc-target-url.</li>
 * </ul>
 * @author Craig Macdonald
 */
public class WARC018Collection implements Collection {
    /** logger for this class */
    protected static final Logger logger = Logger.getLogger(WARC018Collection.class);
    /** Counts the number of documents that have been found in this file. */
    protected int documentsInThisFile = 0;
    /** are we at the end of the collection? */
    protected boolean eoc = false;
    /** has the end of the current input file been reached? */
    protected boolean eof = false;
    /** the input stream of the current input file */
    protected InputStream is = null;
    /** the length (in bytes) of the blob containing the current document's data */
    protected long currentDocumentBlobLength = 0;
    /** properties for the current document: lowercased WARC and HTTP header names
      * mapped to their values, plus the derived docno/url/crawldate/charset keys */
    protected Map<String, String> DocProperties = null;
    /** The list of files to process. */
    protected ArrayList<String> FilesToProcess = new ArrayList<String>();
    /** The index in the FilesToProcess of the currently processed file.*/
    protected int FileNumber = 0;
    /** should UTF8 encoding be assumed? */
    protected final boolean forceUTF8 = Boolean
            .parseBoolean(ApplicationSetup.getProperty("warc018collection.force.utf8", "false"));
    /** what header for the docno document metadata */
    protected final String warc_docno_header = ApplicationSetup
            .getProperty("warc018collection.header.docno", "warc-trec-id").toLowerCase();
    /** what header for the url document metadata */
    protected final String warc_url_header = ApplicationSetup
            .getProperty("warc018collection.header.url", "warc-target-uri").toLowerCase();
    /** what header for the crawldate document metadata */
    protected final String warc_crawldate_header = ApplicationSetup
            .getProperty("warc018collection.header.crawldate", "date").toLowerCase();
    /** How to parse WARC date formats.
      * NB: SimpleDateFormat is NOT thread-safe; all access to this shared
      * instance is synchronized inside {@link #parseDate(String)}. */
    final static SimpleDateFormat dateWARC = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ");

    /** Encoding to be used to open all files. */
    protected String desiredEncoding = ApplicationSetup.getProperty("trec.encoding",
            Charset.defaultCharset().name());
    /** Class to use for all documents parsed by this class */
    protected Class<? extends Document> documentClass;
    /** Tokeniser to use for all documents parsed by this class */
    protected Tokeniser tokeniser = Tokeniser.getTokeniser();

    /** default constructor for this collection object. Reads files from the system
      * default collection.spec file */
    public WARC018Collection() {
        this(ApplicationSetup.COLLECTION_SPEC);
    }

    /** construct a collection from the denoted collection.spec file */
    public WARC018Collection(final String CollectionSpecFilename) {
        readCollectionSpec(CollectionSpecFilename);
        loadDocumentClass();
        try {
            openNextFile();
        } catch (IOException ioe) {
            logger.error("Problem opening first file ", ioe);
        }
    }

    /**
      * A constructor that reads only the specified InputStream.*/
    public WARC018Collection(InputStream input) {
        is = input;
        loadDocumentClass();
    }

    /**
     * Check whether it is the last document in the collection
     * @return boolean
     */
    public boolean hasNext() {
        return !endOfCollection();
    }

    /**
     * Return the next document
     * @return next document
     */
    public Document next() {
        nextDocument();
        return getDocument();
    }

    /** Closes the collection, and any file that may be open.
      * Safe to call when no file was ever opened successfully. */
    public void close() {
        if (is == null)
            return;
        try {
            is.close();
        } catch (IOException ioe) {
            logger.warn("Problem closing collection", ioe);
        }
    }

    /** Returns true if the end of the collection has been reached */
    public boolean endOfCollection() {
        return eoc;
    }

    /** Get the String document identifier of the current document. */
    public String getDocid() {
        return DocProperties.get("docno");
    }

    /** Loads the class that will supply all documents for this Collection.
     * Set by property <tt>trec.document.class</tt>
     * @throws IllegalArgumentException if the configured class cannot be loaded
     */
    protected void loadDocumentClass() {
        try {
            documentClass = Class
                    .forName(ApplicationSetup.getProperty("trec.document.class", TaggedDocument.class.getName()))
                    .asSubclass(Document.class);
        } catch (Exception e) {
            throw new IllegalArgumentException(e);
        }
    }

    /** Get the document object representing the current document.
      * The document is read from the underlying stream through a
      * FixedSizeInputStream bounded by the blob length computed in
      * {@link #nextDocument()}; close() on that view is suppressed so the
      * collection stream itself stays open for subsequent records. */
    public Document getDocument() {
        FixedSizeInputStream fsis = new FixedSizeInputStream(is, currentDocumentBlobLength);
        fsis.suppressClose();
        Document rtr;
        try {
            //the configured Document class must expose an
            //(InputStream, Map, Tokeniser) constructor
            rtr = documentClass.getConstructor(InputStream.class, Map.class, Tokeniser.class).newInstance(fsis,
                    DocProperties, tokeniser);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
        return rtr;
    }

    /** Parse a block of "key: value" headers from the current stream into
      * DocProperties (keys lowercased), stopping at the first empty line.
      * @param requireContentLength if true, keep reading past empty lines
      *        until a content-length header has been seen (WARC headers).
      * @return the number of bytes consumed, including line terminators
      * @throws IOException if reading the stream fails */
    protected int parseHeaders(final boolean requireContentLength) throws IOException {
        int headerSize = 0;
        boolean foundContentLength = false;
        while (true) {
            final String followLine = readLine();
            final int len = followLine.length();
            headerSize += readLineByteCount;
            if (len == 0) {
                //blank line ends the header block, unless we still owe
                //the caller a content-length header
                if (!requireContentLength || foundContentLength)
                    break;
            }
            final int colonIndex = followLine.indexOf(':');
            if (colonIndex < 0) {
                //not a header line; skip it
                continue;
            }
            final String key = followLine.substring(0, colonIndex).trim().toLowerCase();
            final String value = followLine.substring(colonIndex + 1, len).trim();
            DocProperties.put(key, value);
            if (key.equals("content-length"))
                foundContentLength = true;
        }
        if (requireContentLength)
            assert foundContentLength;
        return headerSize;
    }

    /** Fully skip n bytes of the current input stream.
      * InputStream.skip() is permitted to skip fewer bytes than requested,
      * so loop until the requested count is consumed; when skip() makes no
      * progress, fall back to read() to distinguish EOF from a short skip. */
    private void skipFully(final long n) throws IOException {
        long remaining = n;
        while (remaining > 0) {
            final long skipped = is.skip(remaining);
            if (skipped > 0) {
                remaining -= skipped;
            } else if (is.read() == -1) {
                break; //EOF reached before n bytes were skipped
            } else {
                remaining--; //consumed one byte via read()
            }
        }
    }

    /** Move the collection to the start of the next document.
      * Scans forward for the next "WARC/0.18" record of warc-type response,
      * parses its WARC and HTTP headers into DocProperties, and records the
      * remaining blob length for {@link #getDocument()}.
      * @return true if a document was found, false at end of collection */
    public boolean nextDocument() {
        DocProperties = new HashMap<String, String>(15);
        try {
            warcrecord: while (true) {
                String line = readLine();
                //look for the magic warc header line
                if (line.startsWith("WARC/0.18")) {
                    int headerSize = parseHeaders(true);
                    final long warc_response_length = Long.parseLong(DocProperties.get("content-length"));
                    //null-safe: records without a warc-type header are skipped too
                    if (!"response".equals(DocProperties.get("warc-type"))) {
                        //not a document record: skip the entire record body
                        skipFully(warc_response_length);
                        continue warcrecord;
                    }
                    headerSize = parseHeaders(false);
                    DocProperties.put("docno", DocProperties.get(warc_docno_header));
                    DocProperties.put("url", DocProperties.get(warc_url_header));
                    DocProperties.put("crawldate", parseDate(DocProperties.get(warc_crawldate_header)));
                    if (logger.isDebugEnabled())
                        logger.debug("Now working on document " + DocProperties.get("docno"));

                    DocProperties.put("charset", desiredEncoding);
                    //obtain the character set of the document and put in the charset property
                    String cType = DocProperties.get("content-type");
                    //force UTF-8 for english documents - webpage isnt clear:
                    //http://boston.lti.cs.cmu.edu/Data/clueweb09/dataset.html#encodings
                    if (cType != null) {
                        cType = cType.toLowerCase();
                        if (cType.contains("charset")) {
                            final Matcher m = charsetMatchPattern.matcher(cType);
                            if (m.find() && m.groupCount() > 0) {
                                String charset = StringTools.normaliseEncoding(m.group(1));
                                //only trust charsets the JVM actually supports
                                if (CharEncoding.isSupported(charset))
                                    DocProperties.put("charset", charset);
                            }
                        }
                    }
                    if (forceUTF8)
                        DocProperties.put("charset", "utf-8");
                    documentsInThisFile++;
                    //the document blob is the record body minus the HTTP headers
                    currentDocumentBlobLength = warc_response_length - headerSize;
                    return true;
                }
                if (eof) {
                    if (documentsInThisFile == 0) {
                        logger.warn(this.getClass().getSimpleName() + " found no documents in "
                                + FilesToProcess.get(FileNumber - 1) + ". "
                                + "Perhaps trec.collection.class is wrongly set or decompression failed.");
                    }
                    if (!openNextFile())
                        return false;
                }
            }
        } catch (IOException ioe) {
            //pass the throwable so the stack trace is logged, not just toString()
            logger.error("IOException while reading WARC format collection file", ioe);
        }
        return false;
    }

    static final Pattern charsetMatchPattern = Pattern.compile("charset[:=]\\s*['\"]?([0-9a-zA-Z_\\-]+)['\"]?");

    /** number of bytes consumed by the most recent readLine() call,
      * including any line terminator */
    int readLineByteCount;

    /** Read a line from the currently open InputStream is.
      * Handles bare LF and CRLF terminators; a lone CR is kept as content.
      * Sets eof when the end of the stream is reached.
      * NOTE(review): bytes are widened directly to chars, i.e. this assumes
      * one byte per character - valid for ASCII WARC/HTTP headers only.
      * NOTE(review): readLineByteCount is incremented once even when the very
      * first read returns EOF - confirm callers tolerate this overcount. */
    protected String readLine() throws IOException {
        final StringBuilder s = new StringBuilder();
        int c = 0;
        char ch;
        char ch2;
        readLineByteCount = 0;
        while (true) {
            c = is.read();
            readLineByteCount++;
            if (c == -1) {
                eof = true;
                break;
            }
            ch = (char) c;
            if (ch == '\r') {
                c = is.read();
                readLineByteCount++;
                if (c == -1) {
                    s.append(ch);
                    eof = true;
                    break;
                }
                ch2 = (char) c;
                if (ch2 == '\n')
                    break;
                s.append(ch);
                s.append(ch2);
            } else if (ch == '\n') {
                break;
            } else {
                s.append(ch);
            }
        }
        return s.toString();
    }

    /**
     * Opens the next document from the collection specification.
     * @return boolean true if the file was opened successfully. If there
     *      are no more files to open, it returns false.
     * @throws IOException if there is an exception while opening the
     *      collection files.
     */
    protected boolean openNextFile() throws IOException {
        //try to close the currently open file
        if (is != null)
            try {
                is.close();
            } catch (IOException ioe) {
                logger.warn("IOException while closing file being read", ioe);
            }
        //keep trying files
        boolean tryFile = true;
        //return value for this fn
        boolean rtr = false;
        while (tryFile) {
            if (FileNumber < FilesToProcess.size()) {
                String filename = FilesToProcess.get(FileNumber);
                FileNumber++;
                //check the filename is sane
                if (!Files.exists(filename)) {
                    logger.warn("Could not open " + filename + " : File Not Found");
                } else if (!Files.canRead(filename)) {
                    logger.warn("Could not open " + filename + " : Cannot read");
                } else {//filename seems ok, open it
                    is = Files.openFileStream(filename); //throws an IOException, throw upwards
                    logger.info(this.getClass().getSimpleName() + " processing " + filename);
                    //no need to loop again
                    tryFile = false;
                    //return success
                    rtr = true;
                    //accurately record file offset
                    documentsInThisFile = 0;
                    eof = false;
                }
            } else {
                //last file of the collection has been read, EOC
                eoc = true;
                rtr = false;
                tryFile = false;
            }
        }
        return rtr;
    }

    /** Read in the collection.spec: one filename per line, ignoring blank
      * lines and lines starting with '#'. */
    protected void readCollectionSpec(String CollectionSpecFilename) {
        //reads the collection specification file
        try {
            BufferedReader br2 = Files.openFileReader(CollectionSpecFilename);
            try {
                String filename = null;
                FilesToProcess = new ArrayList<String>();
                while ((filename = br2.readLine()) != null) {
                    filename = filename.trim();
                    if (!filename.startsWith("#") && !filename.equals(""))
                        FilesToProcess.add(filename);
                }
            } finally {
                //ensure the reader is closed even if readLine() throws
                br2.close();
            }
            logger.info(this.getClass().getSimpleName() + " read collection specification");
        } catch (IOException ioe) {
            logger.error("Input output exception while loading the collection.spec file. " + "("
                    + CollectionSpecFilename + ")", ioe);
        }
    }

    /** Resets the Collection iterator to the start of the collection.
      * Not supported by this Collection implementation: this is a no-op. */
    public void reset() {
    }

    /** Parse a WARC date (yyyy-MM-dd'T'HH:mm:ssZ) into epoch milliseconds.
      * @param date the header value, may be null
      * @return the epoch time in milliseconds as a String, or "" if the
      *         date is null or unparseable */
    final static String parseDate(String date) {
        if (date == null)
            return "";
        try {
            //SimpleDateFormat is not thread-safe: guard the shared instance
            synchronized (dateWARC) {
                return Long.toString(dateWARC.parse(date).getTime());
            }
        } catch (ParseException pe) {
            return "";
        }
    }

}