org.commoncrawl.util.ArcFileWriter.java Source code

Introduction

Here is the source code for org.commoncrawl.util.ArcFileWriter.java, a CommonCrawl utility class that writes gzip-compressed ARC archive files to a Hadoop FileSystem.
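
Before the full listing, here is a minimal usage sketch. It relies only on what the class itself exposes (the four-argument constructor, close(), and the stats object it returns); the configuration, output path, writer id, and writer count below are illustrative assumptions, not values taken from CommonCrawl's actual deployment.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.commoncrawl.protocol.ArcFileWriterStats;
import org.commoncrawl.util.ArcFileWriter;

public class ArcFileWriterUsageSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // writer id 1, at most 4 simultaneous background HDFS writer threads
        ArcFileWriter writer = new ArcFileWriter(fs, new Path("crawl/arc_output"), 1, 4);

        // records are appended with write(...) or writeRawArcFileItem(...);
        // write(...) may block while maxSimultaneousWriters files are still flushing

        // close(false) freezes the active file, waits for all writer threads to
        // finish, and returns aggregate stats; close(true) also purges the output
        ArcFileWriterStats stats = writer.close(false);
        System.out.println("ARC files written: " + stats.getArcFilesWritten());
    }
}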

Source

/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.util;

import java.io.ByteArrayOutputStream;
import java.io.CharArrayWriter;
import java.io.FilterOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.Date;
import java.util.LinkedList;
import java.util.Map;
import java.util.SortedSet;
import java.util.StringTokenizer;
import java.util.Vector;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.Semaphore;
import java.util.concurrent.locks.AbstractQueuedSynchronizer;
import java.util.concurrent.locks.ReentrantLock;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableName;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.crawl.common.shared.Constants;
import org.commoncrawl.io.NIOBufferList;
import org.commoncrawl.io.NIOBufferListOutputStream;
import org.commoncrawl.io.NIODataSink;
import org.commoncrawl.io.NIOHttpHeaders;
import org.commoncrawl.protocol.ArcFileWriterStats;
import org.commoncrawl.protocol.CrawlURL;
import org.commoncrawl.protocol.MimeTypeCount;
import org.commoncrawl.protocol.shared.ArcFileItem;
import org.commoncrawl.util.GZIPUtils.UnzipResult;
import org.junit.Test;

import com.google.common.collect.TreeMultimap;

/**
 * 
 * @author rana
 * 
 */
public class ArcFileWriter {

    /** logging **/
    private static final Log LOG = LogFactory.getLog(ArcFileWriter.class);

    private static SimpleDateFormat TIMESTAMP14 = new SimpleDateFormat("yyyyMMddHHmmss");
    private static SimpleDateFormat FILENAME_TIMESTAMP = new SimpleDateFormat("yyyy/MM/dd/");

    private static final int MAX_SIZE_DEFAULT = 100000000;
    private static final int MAX_WRITERS_DEFAULT = 10;
    private static final String DEFAULT_ENCODING = "ISO-8859-1";
    private static final String ARC_MAGIC_NUMBER = "filedesc://";
    public static final char LINE_SEPARATOR = '\n';
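    // GZIP 'extra' field bytes (the Internet Archive 'LX' subfield) that get
    // spliced into the first record's GZIP header in generateARCFileMetaData()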
    private static final byte[] ARC_GZIP_EXTRA_FIELD = { 8, 0, 'L', 'X', 4, 0, 0, 0, 0, 0 };
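    // captures the bare mime type: everything up to the first whitespace, ';' or ','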
    private final static Pattern TRUNCATION_REGEX = Pattern.compile("^([^\\s;,]+).*");
    private static final String NO_TYPE_MIMETYPE = "no-type";
    private static final int MAX_METADATA_LINE_LENGTH = (8 * 1024);
    private static final Pattern METADATA_LINE_PATTERN = Pattern
            .compile("^\\S+ \\S+ \\S+ \\S+ \\S+(" + LINE_SEPARATOR + "?)$");
    private static final char HEADER_FIELD_SEPARATOR = ' ';
    private static final String UTF8 = "UTF-8";

    private FileSystem _fileSystem;
    private Path _outputPath;
    private int _id;
    private int _maxSize = MAX_SIZE_DEFAULT;
    private int _maxWriters = MAX_WRITERS_DEFAULT;
    private Semaphore _maxWritersSemaphore = null;
    private Vector<ArcFile> _arcFiles = new Vector<ArcFile>();
    private String _activeFileName = null;
    private int _lastItemPos = -1;
    private int _lastItemCompressedSize = -1;
    private TreeMultimap<String, Integer> _mimeTypeCounts = TreeMultimap.create();

    private OutputStream _out = null;
    private static BitSet dontNeedEncoding;
    static final int caseDiff = ('a' - 'A');

    static {

        dontNeedEncoding = new BitSet(256);
        // alpha characters
        for (int i = 'a'; i <= 'z'; i++) {
            dontNeedEncoding.set(i);
        }
        for (int i = 'A'; i <= 'Z'; i++) {
            dontNeedEncoding.set(i);
        }
        // numeric characters
        for (int i = '0'; i <= '9'; i++) {
            dontNeedEncoding.set(i);
        }
        // special chars
        dontNeedEncoding.set('-');
        dontNeedEncoding.set('~');
        dontNeedEncoding.set('_');
        dontNeedEncoding.set('.');
        dontNeedEncoding.set('*');
        dontNeedEncoding.set('/');
        dontNeedEncoding.set('=');
        dontNeedEncoding.set('&');
        dontNeedEncoding.set('+');
        dontNeedEncoding.set(',');
        dontNeedEncoding.set(':');
        dontNeedEncoding.set(';');
        dontNeedEncoding.set('@');
        dontNeedEncoding.set('$');
        dontNeedEncoding.set('!');
        dontNeedEncoding.set(')');
        dontNeedEncoding.set('(');
        // experiments indicate: Firefox (1.0.6) never escapes '%'
        dontNeedEncoding.set('%');
        // experiments indicate: Firefox (1.0.6) does not escape '|' or '''
        dontNeedEncoding.set('|');
        dontNeedEncoding.set('\'');
    }

    private static class BufferItem {

        public BufferItem(ByteBuffer bufferItem) {
            _buffer = bufferItem;
        }

        public ByteBuffer _buffer;
    };

    private static final class ThreadSync extends AbstractQueuedSynchronizer {

        /**
         * 
         */
        private static final long serialVersionUID = 8771504638721679952L;

        ThreadSync() {
            setState(0);
        }

        int getCount() {
            return getState();
        }

        public int tryAcquireShared(int acquires) {
            return getState() == 0 ? 1 : -1;
        }

        public boolean tryReleaseShared(int releases) {
            // Decrement count; signal when transition to zero
            for (;;) {
                int c = getState();
                if (c == 0)
                    return false;
                int nextc = c - 1;
                if (compareAndSetState(c, nextc))
                    return nextc == 0;
            }
        }

        public void incrementCount() {

            // loop until we can atomically increment ...
            for (;;) {
                int c = getState();
                int nextc = c + 1;
                if (compareAndSetState(c, nextc))
                    break;
            }
        }

    }

    private ThreadSync _activeWriterCount = new ThreadSync();

    private final class ArcFile implements NIODataSink {

        private Path _hdfsPath;
        private NIOBufferList _buffer = new NIOBufferList();
        private NIOBufferListOutputStream _nioStream = new NIOBufferListOutputStream(_buffer);
        private int _streamPos = 0;
        public int _totalHeaderBytesWritten = 0;
        public int _totalContentBytesWritten = 0;
        public int _itemsWritten = 0;
        public int _compressedBytesWritten = 0;
        private final ReentrantLock queueLock = new ReentrantLock();

        private OutputStream _out = new FilterOutputStream(_nioStream) {

            @Override
            public void write(int b) throws IOException {
                ++_streamPos;
                _nioStream.write(b);
            }

            @Override
            public void write(byte[] b, int off, int len) throws IOException {
                _streamPos += len;
                _nioStream.write(b, off, len);
            };
        };

        private LinkedBlockingQueue<BufferItem> _consumerQueue = new LinkedBlockingQueue<BufferItem>();
        private LinkedList<BufferItem> _rewindQueue = new LinkedList<BufferItem>();
        private FSDataOutputStream _hdfsStream = null;
        private FileSystem _hdfs = null;
        private Thread _hdfsWriterThread = null;
        private long _timestamp;
        // bytes consumed via Blocking Consumer interface ...
        int _bytesConsumed = 0;
        private boolean _abort = false;

        // failure exception ... if any ...
        private IOException _failureException = null;

        private void restartWrite() throws IOException {
            LOG.info("Restarting Write of File:" + _hdfsPath);
            if (_hdfsStream != null) {
                LOG.warn("HDFSStream != NULL for File:" + _hdfsPath + " during restart");
                _hdfsStream.close();
                _hdfsStream = null;
            }

            LOG.info("REWIND - Deleting File :" + _hdfsPath);
            // delete existing ...
            _hdfs.delete(_hdfsPath);
            LOG.info("REWIND - ReCreating File :" + _hdfsPath);
            // create new file stream ...
            _hdfsStream = _hdfs.create(_hdfsPath);
            // lock queue
            try {
                queueLock.lock();

                ArrayList<BufferItem> itemList = new ArrayList<BufferItem>();
                LOG.info("REWIND - There are:" + _rewindQueue.size() + " Items in the Rewind Queue for File :"
                        + _hdfsPath);
                itemList.addAll(_rewindQueue);
                LOG.info("REWIND - There are:" + _consumerQueue.size() + " Items in the Consumer Queue for File :"
                        + _hdfsPath);
                _consumerQueue.drainTo(_rewindQueue);
                _consumerQueue.clear();

                int itemCount = 0;
                for (BufferItem bufferItem : itemList) {
                    _consumerQueue.offer(bufferItem);
                    itemCount++;
                }
                LOG.info("REWIND - There should be:" + itemCount + " Items in the Consumer Queue for File :"
                        + _hdfsPath);
                _rewindQueue.clear();
            } finally {
                queueLock.unlock();
            }
        }

        public ArcFile(FileSystem fileSystem, Path arcFilePath, long timestamp) throws IOException {
            // first things first ... we need to acquire the writer semaphore ...
            _maxWritersSemaphore.acquireUninterruptibly();
            // increment thread count in parent class ...
            _activeWriterCount.incrementCount();
            // store hdfs filesystem reference ...
            _hdfs = fileSystem;
            // and the path to our arc file ...
            _hdfsPath = arcFilePath;
            // delete existing ...
            _hdfs.delete(_hdfsPath);
            // create new file stream ...
            _hdfsStream = _hdfs.create(_hdfsPath);
            // and setup the consumer queue relationship
            _buffer.setSink(this);
            // store timestamp that was used to create unique filename
            _timestamp = timestamp;

            // and finally start the blocking writer thread ...
            _hdfsWriterThread = new Thread(new Runnable() {

                public void run() {

                    LOG.info("Writing File:" + _hdfsPath.toString());
                    test: for (;;) {
                        try {
                            BufferItem item = _consumerQueue.take();

                            // add item to rewind queue
                            _rewindQueue.addLast(item);

                            // if buffer item is null... this is considered an eof condition
                            // ... break out ...
                            if (item._buffer == null) {
                                // LOG.info("Received Null BufferItem ... Shutting down File:" +
                                // _hdfsPath.toString());
                                // time to shutdown stream ...
                                try {
                                    _hdfsStream.flush();
                                    _hdfsStream.close();
                                    _hdfsStream = null;
                                    break;
                                } catch (IOException e) {
                                    if (!_abort) {
                                        LOG.error("Exception During Flush of File:" + _hdfsPath
                                                + "(Restarting)  Exception:" + CCStringUtils.stringifyException(e));
                                        try {
                                            _hdfsStream = null;
                                            restartWrite();
                                            continue test;
                                        } catch (IOException e2) {
                                            LOG.error("Restart of Stream:" + _hdfsPath.toString()
                                                    + " Failed with Exception:"
                                                    + CCStringUtils.stringifyException(e2));
                                            _failureException = e2;
                                            // break out of outer loop
                                            break;
                                        }
                                    } else {
                                        LOG.error("Aborting Operation for File:" + _hdfsPath);
                                        break;
                                    }
                                }
                            }
                            // otherwise ... write the buffer contents out to the HDFS stream ...
                            else {

                                try {

                                    int arrayOffset = item._buffer.arrayOffset();
                                    arrayOffset += item._buffer.position();
                                    int end = item._buffer.limit();
                                    byte[] byteBuffer = item._buffer.array();

                                    // LOG.info("Wrote:" + (end-arrayOffset) + "bytes for File:" +
                                    // _hdfsPath.toString());
                                    // write the buffer to disk ...
                                    _hdfsStream.write(byteBuffer, arrayOffset, end - arrayOffset);

                                } catch (IOException e) {
                                    try {
                                        _hdfsStream.close();
                                    } catch (IOException e2) {
                                        LOG.error("Ignoring Exception During Close:"
                                                + CCStringUtils.stringifyException(e2));
                                    } finally {
                                        _hdfsStream = null;
                                    }

                                    if (!_abort) {
                                        LOG.error("Exception During Write of File:" + _hdfsPath
                                                + "(Restarting)  Exception:" + CCStringUtils.stringifyException(e));
                                        try {
                                            restartWrite();
                                            continue test;
                                        } catch (IOException e2) {
                                            LOG.error("Restart of Stream:" + _hdfsPath.toString()
                                                    + " Failed with Exception:"
                                                    + CCStringUtils.stringifyException(e2));
                                            _failureException = e2;
                                            // break out of outer loop
                                            break;
                                        }
                                    } else {
                                        LOG.error("Aborting Operation for File:" + _hdfsPath);

                                        break;
                                    }
                                }
                            }
                        } catch (InterruptedException e) {
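                            // interrupted while blocking on the consumer queue ...
                            // fall through and retry the take() ...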

                        }
                    }

                    LOG.info("Finished Writing File:" + _hdfsPath.toString() + ". Clearing Rewind Queue");
                    _rewindQueue.clear();
                    // release our reference to ourselves ...
                    _hdfsWriterThread = null;
                    // and release the semaphore ...
                    _maxWritersSemaphore.release();
                    // decrement the active thread count ...
                    _activeWriterCount.releaseShared(1);
                }
            });
            // launch the writer thread ...
            _hdfsWriterThread.start();
        }

        public void available(ByteBuffer availableReadBuffer) {
            try {
                queueLock.lock();
                _consumerQueue.offer(new BufferItem(availableReadBuffer));
                _bytesConsumed += availableReadBuffer.remaining();
            } finally {
                queueLock.unlock();
            }
        }

        public void finished() {
            // NOOP
        }

        public void freeze() {
            // add empty buffer to consumer queue ... which will trigger writer thread
            // to flush and terminate ...
            _consumerQueue.offer(new BufferItem(null));
        }

        public OutputStream getOutputStream() {
            return _out;
        }

        public IOException getFailureException() {
            return _failureException;
        }

        public long getTimestamp() {
            return _timestamp;
        }

        /**
         * get the stream position (the number of bytes written to the output
         * stream or file)
         */
        public int getStreamPos() {
            return _streamPos;
        }

        /** get the estimated output file size **/
        public int getFileSize() {
            int fileSizeOut = 0;

            // pick up any pending (unflushed) data ...
            ByteBuffer writeBuffer = _buffer.peekAtWriteBuffer();

            if (writeBuffer != null) {
                fileSizeOut += writeBuffer.capacity() - writeBuffer.remaining();
            }
            fileSizeOut += _bytesConsumed;

            return fileSizeOut;
        }

        public void flush() {
            _buffer.flush();
        }

        public void close() {
            if (_hdfsWriterThread != null) {
                throw new RuntimeException("Arc File close called w/ writer thread still running ...!");
            }
            // ok ... either is called in a clean state or NOT in a clean state ...
            // if stream is open ... non-clean state ... close it ...
            if (_hdfsStream != null) {
                _abort = true;
                try {
                    _hdfsStream.close();
                    _hdfsStream = null;
                } catch (IOException e) {
                    LOG.error(CCStringUtils.stringifyException(e));
                }
                // time to delete the underlying file since it is corrupt ...
                try {
                    _hdfs.delete(_hdfsPath);
                } catch (IOException e) {
                    LOG.error(CCStringUtils.stringifyException(e));
                }
                // and set error condition (if not already set)
                if (_failureException == null) {
                    _failureException = new IOException("ArcFile close called on file in improper state");
                }
            }
        }

        public void delete() {
            try {
                _hdfs.delete(_hdfsPath);
            } catch (IOException e) {
                LOG.error(CCStringUtils.stringifyException(e));
            }
        }

    }

    /** Unit Test Constructor ***/
    public ArcFileWriter() throws IOException {

        if (CrawlEnvironment.getHadoopConfig() == null) {
            Configuration conf = new Configuration();

            conf.addResource("commoncrawl-default.xml");
            conf.addResource("commoncrawl-site.xml");

            CrawlEnvironment.setHadoopConfig(conf);
        }

        _fileSystem = CrawlEnvironment.getDefaultFileSystem();
        _outputPath = new Path("crawl/test");
        _id = 1;
        _maxWritersSemaphore = new Semaphore(_maxWriters);
        rotateFile();
    }

    /**
     * Constructor for the arc file writer.
     * 
     * @throws IOException
     */
    public ArcFileWriter(FileSystem fileSystem, Path outputPath, int writerId, int maxSimultaneousWriters)
            throws IOException {

        _fileSystem = fileSystem;
        _outputPath = outputPath;
        _id = writerId;
        _maxWriters = maxSimultaneousWriters;
        _maxWritersSemaphore = new Semaphore(_maxWriters);

        // set up the initial arc file ...
        rotateFile();
    }

    @Test
    public void testArcFileWriter() throws Exception {

        Path crawlFilePath = new Path("crawl/checkpoint_data/CrawlLog_cc08_1210918849380");

        WritableName.setName(CrawlURL.class, "org.crawlcommons.protocol.CrawlURL");

        SequenceFile.Reader reader = new SequenceFile.Reader(_fileSystem, crawlFilePath,
                CrawlEnvironment.getHadoopConfig());

        Text url = new Text();
        CrawlURL urlData = new CrawlURL();

        while (reader.next(url, urlData)) {

            NIOHttpHeaders headers = CrawlURLHelper.getHeadersFromCrawlURL(urlData);
            write(url.toString(), 1, 1, urlData, headers, "text/html", "test");
        }

        reader.close();
        this.close(false);
    }

    public ArcFileWriterStats close(boolean purgeOutput) throws IOException {

        ArcFileWriterStats statsOut = new ArcFileWriterStats();

        if (getActiveFile() != null) {
            LOG.info("Closing ArcFileWriter ... flushing active file");
            // flush any partial writes ...
            getActiveFile().flush();
            getActiveFile().freeze();
        }

        LOG.info("Generating Stats");
        // flush mime type stats
        for (Map.Entry<String, Integer> mimeTypeEntry : _mimeTypeCounts.entries()) {
            MimeTypeCount mimeTypeCount = new MimeTypeCount();
            mimeTypeCount.setMimeType(mimeTypeEntry.getKey());
            mimeTypeCount.setCount(mimeTypeEntry.getValue());
            statsOut.getMimeTypeCounts().add(mimeTypeCount);
        }
        _mimeTypeCounts.clear();

        SmoothedAverage itemsPerArcFileAvg = new SmoothedAverage(.25);
        for (ArcFile arcFile : _arcFiles) {
            statsOut.setArcFilesWritten(statsOut.getArcFilesWritten() + 1);
            statsOut.setTotalItemsWritten(statsOut.getTotalItemsWritten() + arcFile._itemsWritten);
            itemsPerArcFileAvg.addSample(arcFile._itemsWritten);
            statsOut.setHeaderBytesWritten(statsOut.getHeaderBytesWritten() + arcFile._totalHeaderBytesWritten);
            statsOut.setContentBytesWritten(statsOut.getContentBytesWritten() + arcFile._totalContentBytesWritten);
            statsOut.setCompressedBytesWritten(
                    statsOut.getCompressedBytesWritten() + arcFile._compressedBytesWritten);
        }
        statsOut.setAverageItemsPerFile((float) itemsPerArcFileAvg.getAverage());

        LOG.info("Closing ArcFileWriter ... waiting for all writers to complete");
        // now wait for all arc files writes to finish ...
        _activeWriterCount.acquireShared(1);
        LOG.info("Closing ArcFileWriter ... all writers completed. closing files");

        IOException exceptionOut = null;

        // now walk arc files collecting any exceptions ...
        for (ArcFile arcFile : _arcFiles) {
            if (arcFile.getFailureException() != null) {
                exceptionOut = arcFile.getFailureException();
            }
            arcFile.close();
        }

        LOG.info("Closing ArcFileWriter ... close complete");

        if (purgeOutput) {
            LOG.info("Purging ArcFiles Due to Possible Error");
            for (ArcFile arcFile : _arcFiles) {
                arcFile.delete();
            }
        }
        _arcFiles.clear();

        if (exceptionOut != null)
            throw exceptionOut;

        return statsOut;
    }

    private String escapeURI(String uri, String charsetEncoding) throws IOException {

        boolean needToChange = false;

        StringBuffer out = new StringBuffer(uri.length());

        Charset charset;

        CharArrayWriter charArrayWriter = new CharArrayWriter();

        if (charsetEncoding == null)
            throw new NullPointerException("charsetName");

        try {
            charset = Charset.forName(charsetEncoding);
        } catch (IllegalCharsetNameException e) {
            throw new UnsupportedEncodingException(charsetEncoding);
        } catch (UnsupportedCharsetException e) {
            throw new UnsupportedEncodingException(charsetEncoding);
        }

        for (int i = 0; i < uri.length();) {
            int c = (int) uri.charAt(i);
            // System.out.println("Examining character: " + c);
            if (dontNeedEncoding.get(c)) {
                out.append((char) c);
                i++;
            } else {
                // convert to external encoding before hex conversion
                do {
                    charArrayWriter.write(c);
                    /*
                     * If this character represents the start of a Unicode surrogate pair,
                     * then pass in two characters. It's not clear what should be done if
                     * a bytes reserved in the surrogate pairs range occurs outside of a
                     * legal surrogate pair. For now, just treat it as if it were any
                     * other character.
                     */
                    if (c >= 0xD800 && c <= 0xDBFF) {
                        /*
                         * System.out.println(Integer.toHexString(c) +
                         * " is high surrogate");
                         */
                        if ((i + 1) < uri.length()) {
                            int d = (int) uri.charAt(i + 1);
                            /*
                             * System.out.println("\tExamining " + Integer.toHexString(d));
                             */
                            if (d >= 0xDC00 && d <= 0xDFFF) {
                                /*
                                 * System.out.println("\t" + Integer.toHexString(d) +
                                 * " is low surrogate");
                                 */
                                charArrayWriter.write(d);
                                i++;
                            }
                        }
                    }
                    i++;
                } while (i < uri.length() && !dontNeedEncoding.get((c = (int) uri.charAt(i))));

                charArrayWriter.flush();
                String str = new String(charArrayWriter.toCharArray());
                byte[] ba = str.getBytes(charsetEncoding);
                for (int j = 0; j < ba.length; j++) {
                    out.append('%');
                    char ch = Character.forDigit((ba[j] >> 4) & 0xF, 16);
                    // converting to use uppercase letter as part of
                    // the hex value if ch is a letter.
                    if (Character.isLetter(ch)) {
                        ch -= caseDiff;
                    }
                    out.append(ch);
                    ch = Character.forDigit(ba[j] & 0xF, 16);
                    if (Character.isLetter(ch)) {
                        ch -= caseDiff;
                    }
                    out.append(ch);
                }
                charArrayWriter.reset();
                needToChange = true;
            }
        }

        return (needToChange ? out.toString() : uri);
    }

    /**
     * Write a URL entry via the arc file writer. NOTE: BY DESIGN this call can
     * BLOCK if the number of active writers exceeds the value specified by
     * maxSimultaneousWriters (in the constructor).
     **/
    public boolean write(String normalizedURL, int segmentid, int crawlNumber, CrawlURL urlItem,
            NIOHttpHeaders headers, String contentType, String signature) throws IOException {

        boolean generatedARCFileContent = false;

        // String encodedURI = escapeURI(normalizedURL,UTF8);
        String encodedURI = normalizedURL;
        GoogleURL url = new GoogleURL(normalizedURL);
        if (url.isValid()) {
            encodedURI = url.getCanonicalURL();
        }

        int hostIP = urlItem.getServerIP();
        String hostIPStr = IPAddressUtils.IntegerToIPAddressString(hostIP);
        long fetchBeginTimestamp = urlItem.getLastAttemptTime();
        String encoding = headers.findValue("Content-Encoding");
        String truncationFlags = "";
        if ((urlItem.getFlags() & CrawlURL.Flags.TruncatedDuringDownload) != 0) {
            truncationFlags += ArcFileItem.Flags.toString(ArcFileItem.Flags.TruncatedInDownload);
        }

        byte[] crawlData = urlItem.getContentRaw().getReadOnlyBytes();
        int crawlDataLen = (crawlData != null) ? crawlData.length : 0;

        // validate content type ...
        if (contentType == null) {
            LOG.error("URL:" + normalizedURL + " Rejected - Invalid Content Type:" + contentType);
        } else {

            if (crawlData != null && encoding != null && encoding.equalsIgnoreCase("gzip")) {
                int compressedSize = crawlData.length;
                try {
                    UnzipResult result = GZIPUtils.unzipBestEffort(crawlData, CrawlEnvironment.CONTENT_SIZE_LIMIT);

                    crawlData = result.data.get();
                    crawlDataLen = result.data.getCount();

                    if (result.wasTruncated) {
                        if (truncationFlags.length() != 0)
                            truncationFlags += ",";
                        truncationFlags += ArcFileItem.Flags.toString(ArcFileItem.Flags.TruncatedInInflate);
                    }
                } catch (Exception e) {
                    LOG.error("URL:" + normalizedURL + " Rejected - GZIP Decompression Failed");
                    crawlData = null;
                }
            }

            // content must not be null
            if (crawlData == null) {
                LOG.error("URL:" + normalizedURL + " Rejected - Content is NULL");
            } else {

                // add in our custom headers ...
                headers.add(Constants.ARCFileHeader_ParseSegmentId, ((Integer) segmentid).toString());
                headers.add(Constants.ARCFileHeader_OriginalURL, normalizedURL);

                headers.add(Constants.ARCFileHeader_URLFP, Long.toString(urlItem.getFingerprint()));
                headers.add(Constants.ARCFileHeader_HostFP, Long.toString(urlItem.getHostFP()));
                headers.add(Constants.ARCFileHeader_Signature, signature);
                headers.add(Constants.ARCFileHeader_CrawlNumber, Integer.toString(crawlNumber));
                headers.add(Constants.ARCFileHeader_FetchTimeStamp, Long.toString(urlItem.getLastAttemptTime()));
                // headers.add(Environment.ARCFileHeader_CrawlerId,
                // Integer.toString((int)urlItem.get));

                if (truncationFlags.length() != 0) {
                    headers.add(Constants.ARCFileHeader_ContentTruncated, truncationFlags);
                }

                String headerString = headers.toString() + "\r\n";

                byte[] headerBytes = headerString.getBytes("UTF-8");

                // content is truncated further upstream, so this redundant check /
                // truncation is problematic
                // int contentLength = Math.min(crawlData.length,CONTENT_SIZE_LIMIT);

                // extract the metadata line upfront, since if the url exceeds a
                // certain size limit, we are going to reject the entry ...
                byte metaDataLine[];

                try {
                    metaDataLine = getMetaLine(encodedURI, contentType, hostIPStr, fetchBeginTimestamp,
                            crawlDataLen + headerBytes.length).getBytes(UTF8);
                } catch (IOException e) {
                    LOG.error("Metadata Line Validation FAILED with Exception:"
                            + CCStringUtils.stringifyException(e));
                    // bail here ...
                    return false;
                }

                // get ready to write out a new gzipped entry ...
                preWriteRecordTasks(headerBytes.length, crawlDataLen, contentType);
                try {
                    // ready to write an entry ... the metadata line goes first
                    write(metaDataLine);

                    // write out the headers ...
                    write(headerBytes, 0, headerBytes.length);
                    // write out the content
                    write(crawlData, 0, crawlDataLen);
                    // line separator ...
                    write(LINE_SEPARATOR);

                    // indicate success ...
                    generatedARCFileContent = true;

                } finally {
                    // flush the gzip stream...
                    postWriteRecordTasks();
                }
            }
        }

        return generatedARCFileContent;
    }

    /**
     * 
     * @return timestamp of the current arc file
     */
    public long getActiveFileTimestamp() {
        return getActiveFile().getTimestamp();
    }

    /**
     * 
     * @return the position in the arc file of the last written item
     */
    public int getLastItemPos() {
        return _lastItemPos;
    }

    /**
     * 
     * @return the compressed size (within the arc file) of the last written item
     */
    public int getLastItemCompressedSize() {
        return _lastItemCompressedSize;
    }

    private ArcFile getActiveFile() {
        if (_arcFiles.size() != 0) {
            return _arcFiles.lastElement();
        }
        return null;
    }

    private static NIOHttpHeaders getHeadersFromString(String headers) {

        NIOHttpHeaders headersOut = new NIOHttpHeaders();

        StringTokenizer tokenizer = new StringTokenizer(headers, "\r\n");

        while (tokenizer.hasMoreElements()) {
            String token = tokenizer.nextToken();

            if (token != null && token.length() != 0) {
                int colonPos = token.indexOf(':');

                if (colonPos != -1 && colonPos != token.length() - 1) {

                    String key = token.substring(0, colonPos);
                    String value = token.substring(colonPos + 1);

                    if (key.length() != 0 && value.length() != 0) {
                        headersOut.add(key, value);
                    }
                } else {
                    headersOut.add(null, token);
                }

            }
        }
        return headersOut;
    }

    public static String getMetaLine(String uri, String contentType, String hostIP, long fetchBeginTimeStamp,
            long recordLength) throws IOException {

        if (fetchBeginTimeStamp <= 0) {
            throw new IOException("Bogus fetchBeginTimestamp: " + Long.toString(fetchBeginTimeStamp));
        }

        return createMetaline(uri, hostIP, TIMESTAMP14.format(new Date(fetchBeginTimeStamp)), contentType,
                Long.toString(recordLength));
    }

    public static String createMetaline(String uri, String hostIP, String timeStamp, String mimetype,
            String recordLength) {
        return uri + HEADER_FIELD_SEPARATOR + hostIP + HEADER_FIELD_SEPARATOR + timeStamp + HEADER_FIELD_SEPARATOR
                + mimetype + HEADER_FIELD_SEPARATOR + recordLength + LINE_SEPARATOR;
    }

    protected void rotateFile() throws IOException {

        if (getActiveFile() != null) {

            ArcFile activeFile = getActiveFile();

            // flush any partial writes ...
            activeFile.flush();
            // close it ...
            activeFile.freeze();

        }

        // generate a timestamp value ...
        long timestamp = System.currentTimeMillis();

        // create a new arc file based on path and timestamp
        _activeFileName = generateNewARCFilename(timestamp);

        // create arc file path ...
        Path arcFilePath = new Path(_outputPath, _activeFileName);
        // and create a new ArcFile object ...
        ArcFile newArcFile = new ArcFile(_fileSystem, arcFilePath, timestamp);
        // and make it the active arc file ...
        _arcFiles.add(newArcFile);
        // and set up output stream ...
        _out = newArcFile.getOutputStream();
        // and write out the first record ...
        writeFirstRecord(TIMESTAMP14.format(new Date(System.currentTimeMillis())));
    }

    private String generateNewARCFilename(long timestamp) {
        return timestamp + "_" + _id + ".arc.gz";
        /*
         * Date date = new Date(timestamp); String arcFileName =
         * FILENAME_TIMESTAMP.format(date) + timestamp + "-" + _id + "arc.gz";
         * return arcFileName;
         */
    }

    private String getARCFilename() {
        return _activeFileName;
    }

    /**
     * Call this method just before/after any significant write.
     * 
     * Call at the end of writing a record, or just before starting to write a
     * new record. Will close the current file and open a new one if the file
     * size has passed our maxSize.
     * 
     * <p>
     * Creates and opens a file if none is already open. One use of this method,
     * then, is after construction: call it to write the metadata record, then
     * query the active file's stream position to find the offset of the first
     * record.
     * 
     * @exception IOException
     */
    private void checkSize(int headerBytesLength, int contentBytesLength) throws IOException {
        if (getActiveFile() == null || (_maxSize != -1 && (getActiveFile().getFileSize() > _maxSize))) {
            rotateFile();
        }
    }

    /**
     * append a pre-generated arcfile entry directly into the arc file writer
     * 
     * @param arcFileData
     *          - the compressed arc file entry
     * @param dataBufferLength
     *          - the entry length
     * @throws IOException
     */
    public void writeRawArcFileItem(String contentType, byte[] arcFileData, int dataBufferLength)
            throws IOException {
        // check to see if we need to start a new underlying file
        checkSize(0, dataBufferLength);
        // update stats
        getActiveFile()._totalContentBytesWritten += dataBufferLength;
        getActiveFile()._itemsWritten++;
        SortedSet<Integer> counts = _mimeTypeCounts.get(contentType);
        if (counts.size() == 0) {
            counts.add(1);
        } else {
            int count = counts.first() + 1;
            counts.clear();
            counts.add(count);
        }
        // record start position of this item
        _lastItemPos = getActiveFile().getFileSize();
        // write out data
        _out.write(arcFileData, 0, dataBufferLength);
        // record size of last item
        _lastItemCompressedSize = (getActiveFile().getFileSize() - _lastItemPos);
        // update stats
        getActiveFile()._compressedBytesWritten += _lastItemCompressedSize;
    }

    private void preWriteRecordTasks(int headerBytesLength, int contentBytesLength, String contentType)
            throws IOException {

        checkSize(headerBytesLength, contentBytesLength);

        // update stats
        getActiveFile()._totalHeaderBytesWritten += headerBytesLength;
        getActiveFile()._totalContentBytesWritten += contentBytesLength;
        getActiveFile()._itemsWritten++;
        SortedSet<Integer> counts = _mimeTypeCounts.get(contentType);
        if (counts.size() == 0) {
            counts.add(1);
        } else {
            int count = counts.first() + 1;
            counts.clear();
            counts.add(count);
        }

        // record start position of this item
        _lastItemPos = getActiveFile().getFileSize();

        // Wrap stream in GZIP Writer.
        // The below construction immediately writes the GZIP 'default'
        // header out on the underlying stream.
        _out = new CompressedStream(_out);
    }

    private void postWriteRecordTasks() throws IOException {
        CompressedStream o = (CompressedStream) _out;
        o.finish();
        o.flush();
        o.end();
        _out = o.getWrappedStream();
        // record size of last item
        _lastItemCompressedSize = (getActiveFile().getFileSize() - _lastItemPos);
        // update stats
        getActiveFile()._compressedBytesWritten += _lastItemCompressedSize;
    }

    private void write(final byte[] b, int offset, int size) throws IOException {
        _out.write(b, offset, size);
    }

    private void write(final byte[] b) throws IOException {
        _out.write(b);
    }

    private void write(int b) throws IOException {
        _out.write(b);
    }

    private void writeFirstRecord(final String ts) throws IOException {
        write(generateARCFileMetaData(ts));
    }

    /**
     * A subclass of GZIPOutputStream that gives access to the underlying output
     * stream and offers an end() that does not also close the underlying stream.
     * 
     * @author stack
     */
    public static class CompressedStream extends GZIPOutputStream {
        public CompressedStream(OutputStream out) throws IOException {
            super(out);
        }

        /**
         * @return Reference to stream being compressed.
         */
        OutputStream getWrappedStream() {
            return this.out;
        }

        /**
         * Release the deflater's native process resources, which otherwise would
         * not occur until either finalization or DeflaterOutputStream.close()
         * (which would also close underlying stream).
         */
        public void end() {
            def.end();
        }
    }

    /**
     * Gzip the passed bytes. Use only when the array is small.
     * 
     * @param bytes
     *          What to gzip.
     * @return A gzip member of bytes.
     * @throws IOException
     */
    private static byte[] gzip(byte[] bytes) throws IOException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        GZIPOutputStream gzipOS = new GZIPOutputStream(baos);
        gzipOS.write(bytes, 0, bytes.length);
        gzipOS.close();
        return baos.toByteArray();
    }

    private byte[] generateARCFileMetaData(String date) throws IOException {

        String metadataHeaderLinesTwoAndThree = getMetadataHeaderLinesTwoAndThree("1 " + "0");
        int recordLength = metadataHeaderLinesTwoAndThree.getBytes(DEFAULT_ENCODING).length;
        String metadataHeaderStr = ARC_MAGIC_NUMBER + getARCFilename() + " 0.0.0.0 " + date + " text/plain "
                + recordLength + metadataHeaderLinesTwoAndThree;

        ByteArrayOutputStream metabaos = new ByteArrayOutputStream(recordLength);

        // Write the metadata header.
        metabaos.write(metadataHeaderStr.getBytes(DEFAULT_ENCODING));
        // Write out a LINE_SEPARATOR to end this record.
        metabaos.write(LINE_SEPARATOR);

        // Now get bytes of all just written and compress if flag set.
        byte[] bytes = metabaos.toByteArray();

        // GZIP the header but catch the gzipping into a byte array so we
        // can add the special IA GZIP header to the product. After
        // manipulations, write to the output stream (The JAVA GZIP
        // implementation does not give access to GZIP header. It
        // produces a 'default' header only). We can get away w/ these
        // manipulations because the GZIP 'default' header doesn't
        // do the 'optional' CRC'ing of the header.

        byte[] gzippedMetaData = gzip(bytes);

        if (gzippedMetaData[3] != 0) {
            throw new IOException("The GZIP FLG header is unexpectedly non-zero. "
                    + "Need to add smarter code that can deal with already extant extra GZIP header fields.");
        }

        // Set the GZIP FLG header to '4' which says that the GZIP header
        // has extra fields. Then insert the alex {'L', 'X', '0', '0', '0',
        // '0'} 'extra' field. The IA GZIP header will also set byte
        // 9 (zero-based), the OS byte, to 3 (Unix). We'll do the same.
        gzippedMetaData[3] = 4;
        gzippedMetaData[9] = 3;

        byte[] assemblyBuffer = new byte[gzippedMetaData.length + ARC_GZIP_EXTRA_FIELD.length];
        // '10' in the below is a pointer past the following bytes of the
        // GZIP header: ID1 ID2 CM FLG + MTIME(4-bytes) XFL OS. See
        // RFC1952 for an explanation of the abbreviations just used.
        System.arraycopy(gzippedMetaData, 0, assemblyBuffer, 0, 10);
        System.arraycopy(ARC_GZIP_EXTRA_FIELD, 0, assemblyBuffer, 10, ARC_GZIP_EXTRA_FIELD.length);
        System.arraycopy(gzippedMetaData, 10, assemblyBuffer, 10 + ARC_GZIP_EXTRA_FIELD.length,
                gzippedMetaData.length - 10);
        bytes = assemblyBuffer;

        return bytes;
    }

    private String getMetadataHeaderLinesTwoAndThree(String version) {
        StringBuffer buffer = new StringBuffer();
        buffer.append(LINE_SEPARATOR);
        buffer.append(version);
        buffer.append(" CommonCrawl");
        buffer.append(LINE_SEPARATOR);
        buffer.append("URL IP-address Archive-date Content-type Archive-length");
        buffer.append(LINE_SEPARATOR);
        return buffer.toString();
    }

    private static String truncateMimeType(String contentType) {
        if (contentType == null) {
            contentType = NO_TYPE_MIMETYPE;
        } else {
            Matcher matcher = TRUNCATION_REGEX.matcher(contentType);
            if (matcher.matches()) {
                contentType = matcher.group(1);
            } else {
                contentType = NO_TYPE_MIMETYPE;
            }
        }

        return contentType;
    }

    /**
     * Test that the metadata line is valid before writing.
     * 
     * @param metaLineStr
     * @throws IOException
     * @return The passed in metaline.
     */
    protected String validateMetaLine(String metaLineStr) throws IOException {
        if (metaLineStr.length() > MAX_METADATA_LINE_LENGTH) {
            throw new IOException("Metadata line length is " + metaLineStr.length() + " which is > than maximum "
                    + MAX_METADATA_LINE_LENGTH);
        }
        Matcher m = METADATA_LINE_PATTERN.matcher(metaLineStr);
        if (!m.matches()) {
            throw new IOException("Metadata line doesn't match expected" + " pattern: " + metaLineStr);
        }
        return metaLineStr;
    }

}