org.commoncrawl.service.parser.ec2.EC2ParserNode.java Source code

Java tutorial

Introduction

Here is the source code for org.commoncrawl.service.parser.ec2.EC2ParserNode.java

Source

/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/
package org.commoncrawl.service.parser.ec2;

import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
import java.util.UUID;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicBoolean;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.protocol.CrawlURL;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.Tuples.Pair;

import com.google.api.client.http.GenericUrl;
import com.google.api.client.http.HttpRequest;
import com.google.api.client.http.HttpRequestFactory;
import com.google.api.client.http.HttpResponse;
import com.google.api.client.http.HttpTransport;
import com.google.api.client.http.javanet.NetHttpTransport;
import com.google.common.collect.Lists;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import com.google.gson.stream.JsonReader;

/**
 * Worker node that repeatedly checks out a crawl log from the parser master
 * over HTTP, reads it as a {@link SequenceFile} of ({@link Text},
 * {@link CrawlURL}) records and periodically reports its read position back
 * to the master.
 *
 * <p>
 * All work happens on a single background thread that is started by the
 * constructor and terminated by {@link #stop()}. Records are currently only
 * logged as they are read.
 */
public class EC2ParserNode implements Runnable, Constants {

    public static final Log LOG = LogFactory.getLog(EC2ParserNode.class);

    /** Address of the master node this worker reports to. */
    private static final String DEFAULT_MASTER_HOST = "10.0.20.21";

    /**
     * How long (in milliseconds) the worker waits before asking the master
     * again after a failed or empty checkout.
     */
    private static final long IDLE_RETRY_DELAY = 5 * 1000;

    Configuration _conf;
    Thread _thread;
    FileSystem _fs;
    UUID _uuid;
    String _masterHost;
    String _hostName;

    /**
     * Creates the node, opens the S3 file system and starts the worker thread.
     *
     * @param hostName name this node reports to the master in every request
     * @param conf     Hadoop configuration; the S3 credential properties are
     *                 written into it
     * @throws IOException        if the file system cannot be opened
     * @throws URISyntaxException if the file system URI is malformed
     */
    public EC2ParserNode(String hostName, Configuration conf) throws IOException, URISyntaxException {
        _conf = conf;
        // SECURITY NOTE(review): credentials are embedded in source. They should be
        // rotated and supplied through configuration or the environment instead;
        // left in place here because callers currently rely on them being set.
        _conf.set("fs.s3n.awsAccessKeyId", "079HD5ZAQSKEY542V7R2");
        _conf.set("fs.s3n.awsSecretAccessKey", "g4Ow3MSj77mqEw3uf4fZ22QPXuH991YP/rak8FJX");
        _fs = FileSystem.get(new URI("s3n://aws-publicdatasets/"), _conf);
        _uuid = UUID.randomUUID();
        _masterHost = DEFAULT_MASTER_HOST;
        _hostName = hostName;
        // Started last so that every field above is assigned before run() executes.
        startThread();
    }

    /** Entry of the node's queue; a {@code null} path is used by {@link #stop()} as a sentinel. */
    private static class QueueItem {
        public QueueItem(Path path) {
            crawlLogPath = path;
        }

        Path crawlLogPath;
    }

    /**
     * Queue filled by {@link #addToQueue(Path)}. The worker loop in this class
     * does not consume it; work is obtained from the master instead.
     */
    LinkedBlockingQueue<QueueItem> _queue = new LinkedBlockingQueue<QueueItem>();

    /** Monitor used to wake the worker out of its retry delay when stop() is called. */
    private final Object _shutdownSignal = new Object();

    private void startThread() {
        _thread = new Thread(this);
        _thread.start();
    }

    /**
     * Requests shutdown and blocks until the worker thread has exited.
     *
     * <p>
     * The worker observes the request between records and between checkouts,
     * so a log that is being read is abandoned at its current position without
     * a final check-in. If the calling thread is interrupted while waiting, its
     * interrupt status is restored and the method returns early.
     */
    public void stop() {
        Thread worker = _thread;
        if (worker != null) {
            LOG.info("Stopping Thread");
            // The flag is what actually ends run(); notify under the monitor so a
            // worker sitting in its retry delay wakes up immediately.
            synchronized (_shutdownSignal) {
                _shutdownActive.set(true);
                _shutdownSignal.notifyAll();
            }
            try {
                _queue.put(new QueueItem(null));
                LOG.info("Waiting for Thread to Die");
                worker.join();
                LOG.info("Thread dead");
                _thread = null;
            } catch (InterruptedException e) {
                LOG.warn("Interrupted while waiting for worker thread to exit");
                Thread.currentThread().interrupt();
            }
        }
    }

    /**
     * Appends a crawl log path to the node's queue.
     *
     * @param path crawl log path to enqueue
     * @throws IOException declared for callers; not thrown by this implementation
     */
    public void addToQueue(Path path) throws IOException {
        _queue.add(new QueueItem(path));
    }

    static final HttpTransport HTTP_TRANSPORT = new NetHttpTransport();

    /**
     * Builds the part of a master URL shared by every request: scheme, host,
     * port, the single path segment and the host/uuid query parameters.
     */
    private GenericUrl newMasterUrl(String action) {
        GenericUrl url = new GenericUrl();
        url.setScheme("http");
        url.setHost(_masterHost);
        url.setPort(CrawlEnvironment.DEFAULT_EC2MASTER_HTTP_PORT);
        url.setPathParts(Lists.newArrayList("", action));
        url.put("host", _hostName);
        url.put("uuid", _uuid.toString());
        return url;
    }

    /** Builds a master URL that additionally carries the active file and read position. */
    private GenericUrl newProgressUrl(String action, String activeFile, long pos) {
        GenericUrl url = newMasterUrl(action);
        url.put("activeFile", activeFile);
        url.put("pos", pos);
        return url;
    }

    /** @return the URL used to ask the master for a crawl log to process */
    GenericUrl buildCheckoutURL() {
        GenericUrl url = newMasterUrl("checkout");
        LOG.info(url.build());
        return url;
    }

    /**
     * @param activeFile name of the log currently being read
     * @param pos        current read position within that log
     * @return the URL used to report intermediate progress to the master
     */
    GenericUrl buildPingURL(String activeFile, long pos) {
        GenericUrl url = newProgressUrl("ping", activeFile, pos);
        LOG.info(url.build());
        return url;
    }

    /**
     * @param activeFile name of the log that has been read to the end
     * @param pos        final read position within that log
     * @return the URL used to hand a completed log back to the master
     */
    GenericUrl buildCheckInURL(String activeFile, long pos) {
        GenericUrl url = newProgressUrl("checkin", activeFile, pos);
        LOG.info(url.build());
        return url;
    }

    /** Set by {@link #stop()}; polled by the worker loop. */
    AtomicBoolean _shutdownActive = new AtomicBoolean();
    HttpRequestFactory factory = HTTP_TRANSPORT.createRequestFactory();

    /**
     * Asks the master for a crawl log to process.
     *
     * @return the log name and the position to resume from, or {@code null}
     *         when the master answers with a status other than 200
     * @throws IOException if the request fails or the reply is not the expected
     *                     JSON object with {@code name} and {@code lastPos}
     */
    private Pair<String, Long> checkoutFile() throws IOException {
        HttpRequest request = factory.buildGetRequest(buildCheckoutURL());
        HttpResponse response = request.execute();
        try {
            if (response.getStatusCode() != 200) {
                return null;
            }

            String logName;
            long lastPos;
            try {
                JsonObject reply = new JsonParser()
                        .parse(new JsonReader(
                                new InputStreamReader(response.getContent(), Charset.forName("UTF-8"))))
                        .getAsJsonObject();
                logName = reply.get("name").getAsString();
                lastPos = reply.get("lastPos").getAsLong();
            } catch (RuntimeException ex) {
                // Missing fields or invalid JSON surface as unchecked exceptions, which
                // would otherwise kill the worker thread; report them as an I/O failure.
                throw new IOException("Malformed checkout response from master", ex);
            }
            LOG.info("Got Name:" + logName + " Pos:" + lastPos);

            return new Pair<String, Long>(logName, lastPos);
        } finally {
            // Release the underlying connection whether or not the body was consumed.
            response.disconnect();
        }
    }

    /**
     * @param logName crawl log file name
     * @return the path of the crawl log below the bucket's crawl-log source directory
     */
    public static final Path buildCrawlLogPath(String logName) {
        return new Path("/" + CC_BUCKET_ROOT + CC_CRAWLLOG_SOURCE + logName);
    }

    /**
     * @param logName   crawl log file name
     * @param timestamp timestamp embedded in the marker file name
     * @param position  read position embedded in the marker file name
     * @return the path of the checkpoint marker file for the given log state
     */
    public static final Path buildCrawlLogCheckpointPath(String logName, long timestamp, long position) {
        return new Path("/" + CC_BUCKET_ROOT + CC_PARSER_INTERMEDIATE + logName + "_" + timestamp + "_" + position
                + DONE_SUFFIX);
    }

    /** Minimum time in milliseconds between two intermediate checkpoints. */
    private static final int CHECKPOINT_INTERVAL = 1 * 5 * 1000;

    /**
     * Worker loop: checks out a log, processes it, and repeats until
     * {@link #stop()} is called. I/O failures are logged and followed by a
     * short delay so a missing master or an empty work list does not turn into
     * a tight request loop.
     */
    @Override
    public void run() {
        while (!_shutdownActive.get()) {
            boolean processedLog = false;
            try {
                Pair<String, Long> checkoutInfo = checkoutFile();
                if (checkoutInfo != null) {
                    processLog(checkoutInfo.e0, checkoutInfo.e1);
                    processedLog = true;
                }
            } catch (IOException e) {
                LOG.error(CCStringUtils.stringifyException(e));
            }
            if (!processedLog) {
                waitBeforeRetry();
            }
        }
    }

    /**
     * Reads the given crawl log from {@code startPos}, checkpointing at most once
     * per {@link #CHECKPOINT_INTERVAL}, and checks the log in once its end is
     * reached. Returns early, without the final check-in, when shutdown has been
     * requested.
     */
    private void processLog(String logName, long startPos) throws IOException {
        Path logPath = buildCrawlLogPath(logName);
        LOG.info("Opening File At LogPath:" + logPath);

        SequenceFile.Reader reader = new SequenceFile.Reader(_fs, logPath, _conf);
        try {
            long lastPos = startPos;

            LOG.info("Seeking to Pos:" + lastPos);

            // Seek inside the try block so the reader is closed if the seek fails.
            if (lastPos != 0) {
                reader.seek(lastPos);
            }

            Text key = new Text();
            CrawlURL urlData = new CrawlURL();
            long lastCheckpointTime = System.currentTimeMillis();
            boolean reachedEndOfLog = false;

            while (!_shutdownActive.get()) {
                if (!reader.next(key, urlData)) {
                    reachedEndOfLog = true;
                    break;
                }
                // Only checkpoint when the position moved since the last checkpoint
                // and the checkpoint interval has elapsed.
                if (reader.getPosition() != lastPos
                        && System.currentTimeMillis() - lastCheckpointTime >= CHECKPOINT_INTERVAL) {
                    doCheckpoint(logName, lastCheckpointTime, reader.getPosition(), false);
                    lastPos = reader.getPosition();
                    lastCheckpointTime = System.currentTimeMillis();
                }
                LOG.info("Pos:" + reader.getPosition() + " Key:" + key.toString() + " ValueLen:"
                        + urlData.getContentRaw().getCount());
            }

            if (reachedEndOfLog) {
                doCheckpoint(logName, lastCheckpointTime, reader.getPosition(), true);
            } else {
                LOG.info("Shutdown requested; abandoning log:" + logName + " at position:"
                        + reader.getPosition());
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Blocks for up to {@link #IDLE_RETRY_DELAY} milliseconds, returning sooner
     * when {@link #stop()} is called. An interrupt is treated as a shutdown
     * request so the worker cannot spin on a pending interrupt.
     */
    private void waitBeforeRetry() {
        synchronized (_shutdownSignal) {
            if (_shutdownActive.get()) {
                return;
            }
            try {
                _shutdownSignal.wait(IDLE_RETRY_DELAY);
            } catch (InterruptedException e) {
                _shutdownActive.set(true);
                Thread.currentThread().interrupt();
            }
        }
    }

    /**
     * Writes a one-byte checkpoint marker file for the given log state and then
     * notifies the master.
     *
     * @param logFileName   name of the log being processed
     * @param timestamp     timestamp embedded in the marker file name
     * @param position      read position being recorded
     * @param isFinalCommit {@code true} to send a check-in request, {@code false}
     *                      to send a ping request
     * @throws IOException if the marker cannot be written or the request fails
     */
    private void doCheckpoint(String logFileName, long timestamp, long position, boolean isFinalCommit)
            throws IOException {
        Path checkpointFile = buildCrawlLogCheckpointPath(logFileName, timestamp, position);
        FSDataOutputStream outputStream = _fs.create(checkpointFile);
        try {
            outputStream.write(1);
            outputStream.flush();
        } finally {
            outputStream.close();
        }

        GenericUrl url = isFinalCommit ? buildCheckInURL(logFileName, position)
                : buildPingURL(logFileName, position);
        HttpResponse response = factory.buildGetRequest(url).execute();
        try {
            LOG.info("Checkpointing log:" + logFileName + " position:" + position);
            if (response.getStatusCode() == 200) {
                LOG.info("Checkpointing for log:" + logFileName + " position:" + position + " SUCCEEDED");
            } else {
                LOG.error("Checkpoint for log:" + logFileName + " position:" + position + " FAILED WITH ERROR: "
                        + response.getStatusCode() + " " + response.getStatusMessage());
            }
        } finally {
            // The body is never read here, so the connection must be released explicitly.
            response.disconnect();
        }
    }

    /**
     * Starts a node named {@code test-host} and leaves it running until the JVM
     * is asked to exit, at which point a shutdown hook stops the worker.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        try {
            final EC2ParserNode parser = new EC2ParserNode("test-host", conf);
            parser.addToQueue(new Path("/common-crawl/crawl-intermediate/CrawlLog_ccc01-01_1328300149459"));
            // The worker thread keeps the JVM alive after main returns; stop it
            // cleanly when the process is terminated instead of stopping it
            // before it has done any work.
            Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() {
                @Override
                public void run() {
                    parser.stop();
                }
            }));
        } catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
        } catch (URISyntaxException e) {
            LOG.error("Invalid file system URI", e);
        }
    }
}