uk.bl.wa.hadoop.mapreduce.lib.DereferencingArchiveToCDXRecordReader.java Source code

Introduction

Here is the source code for uk.bl.wa.hadoop.mapreduce.lib.DereferencingArchiveToCDXRecordReader.java, part of the warc-hadoop-recordreaders module of the webarchive-discovery project. The class is a Hadoop RecordReader that reads a text input listing (W)ARC file paths, opens each archive in turn, and emits one CDX index line per record, using the line as both key and value.
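The reader is configured entirely through job properties that initialize() reads: cdx.format (the CDX header line to emit), cdx.hdfs (whether to rewrite the filename field), cdx.metatag (an optional tag prepended to the 'M' field), plus an optional DistributedCache file for the ARK lookup. The driver below is a minimal sketch of wiring this up with a Hadoop 2.x-style job; the ArchiveToCDXFileInputFormat class is assumed to be the input format that accompanies this reader, and the input/output paths are purely illustrative.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import uk.bl.wa.hadoop.mapreduce.lib.DereferencingArchiveToCDXRecordReader;

public class CdxIndexDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Properties read by initialize(): CDX output format, HDFS path rewriting, optional meta tag.
        conf.set("cdx.format", DereferencingArchiveToCDXRecordReader.CDX_11);
        conf.set("cdx.hdfs", "true");
        // conf.set("cdx.metatag", "OA"); // optional licence/meta tag, value is illustrative

        Job job = Job.getInstance(conf, "archives-to-cdx");
        job.setJarByClass(CdxIndexDriver.class);
        // ArchiveToCDXFileInputFormat is assumed here: any InputFormat whose
        // createRecordReader() returns DereferencingArchiveToCDXRecordReader will do.
        job.setInputFormatClass(uk.bl.wa.hadoop.mapreduce.lib.ArchiveToCDXFileInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));   // text file listing (W)ARC paths
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}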

Source

package uk.bl.wa.hadoop.mapreduce.lib;

/*
 * #%L
 * warc-hadoop-recordreaders
 * %%
 * Copyright (C) 2013 - 2018 The webarchive-discovery project contributors
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as
 * published by the Free Software Foundation, either version 2 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/gpl-2.0.html>.
 * #L%
 */

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Iterator;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import org.apache.log4j.Logger;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveReaderFactory;
import org.archive.io.arc.ARCReader;
import org.archive.io.warc.WARCReader;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.resourceindex.cdx.SearchResultToCDXFormatAdapter;
import org.archive.wayback.resourceindex.cdx.format.CDXFormat;
import org.archive.wayback.resourceindex.cdx.format.CDXFormatException;
import org.archive.wayback.resourcestore.indexer.ArcIndexer;
import org.archive.wayback.resourcestore.indexer.WarcIndexer;

public class DereferencingArchiveToCDXRecordReader<Key extends WritableComparable<?>, Value extends Writable>
        extends RecordReader<Text, Text> {

    public static final String CDX_09 = " CDX N b a m s k r V g";
    public static final String CDX_10 = " CDX A b a m s k r M V g";
    public static final String CDX_11 = " CDX N b a m s k r M S V g";

    private static final Logger LOGGER = Logger.getLogger(DereferencingArchiveToCDXRecordReader.class.getName());
    private LineRecordReader internal = new LineRecordReader();
    private FSDataInputStream datainputstream;
    private FileSystem filesystem;
    private ArchiveReader arcreader;
    private Iterator<CaptureSearchResult> archiveIterator;
    private Iterator<String> cdxlines;
    private WarcIndexer warcIndexer = new WarcIndexer();
    private ArcIndexer arcIndexer = new ArcIndexer();
    private Text key;
    private Text value;
    private CDXFormat cdxFormat;
    private boolean hdfs;
    private int gIndex = -1;
    private String metaTag;
    private int MIndex = -1;
    private HashMap<String, String> warcArkLookup = new HashMap<String, String>();

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        FileSplit fileSplit = (FileSplit) split;
        //
        this.hdfs = Boolean.parseBoolean(conf.get("cdx.hdfs", "false"));
        this.filesystem = fileSplit.getPath().getFileSystem(conf);
        String format = conf.get("cdx.format", CDX_10);
        try {
            this.cdxFormat = new CDXFormat(format);
        } catch (CDXFormatException e) {
            LOGGER.error("initialize(): " + e.getMessage());
        }
        // Locate the 'g' (filename) and 'M' (meta tags) columns within the chosen CDX format.
        String[] parts = format.substring(5).split(" ");
        for (int i = 0; i < parts.length; i++) {
            if ("g".equals(parts[i])) {
                gIndex = i;
            }
            if ("M".equals(parts[i])) {
                MIndex = i;
            }
        }
        internal.initialize(split, context);
        this.getLookup(conf);
        metaTag = conf.get("cdx.metatag");
        //
        // warcIndexer.setProcessAll(true);
        //
        LOGGER.info("Initialised with format: " + format);
    }

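    /**
     * Loads an optional lookup table from the DistributedCache. Each line of the
     * cached file is whitespace-separated, mapping an archive filename to an
     * identifier (e.g. an ARK) that hdfsPath() later prepends to the CDX
     * filename field as "identifier#filename".
     */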
    private void getLookup(Configuration conf) {
        try {
            URI[] uris = DistributedCache.getCacheFiles(conf);
            if (uris != null) {
                for (URI uri : uris) {
                    FSDataInputStream input = this.filesystem.open(new Path(uri.getPath()));
                    BufferedReader reader = new BufferedReader(new InputStreamReader(input));
                    String line;
                    String[] values;
                    while ((line = reader.readLine()) != null) {
                        values = line.split("\\s+");
                        warcArkLookup.put(values[0], values[1]);
                    }
                    System.out.println("Added " + warcArkLookup.size() + " entries to ARK lookup.");
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    @Override
    public void close() {
        if (datainputstream != null) {
            try {
                datainputstream.close();
            } catch (IOException e) {
                System.err.println("close(): " + e.getMessage());
            }
        }
        try {
            internal.close();
        } catch (IOException e) {
            LOGGER.error("close(): " + e.getMessage());
        }
    }

    @Override
    public float getProgress() throws IOException {
        return internal.getProgress();
    }

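    /**
     * Emits the next CDX line from the current archive. When the current archive
     * is exhausted, the next path is read from the underlying LineRecordReader,
     * the archive is opened and a fresh CDX iterator is started. Returns false
     * once there are no more input paths.
     */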
    @Override
    public boolean nextKeyValue() {
        if (this.key == null) {
            this.key = new Text();
        }
        if (this.value == null) {
            this.value = new Text();
        }
        String line;
        while (true) {
            try {
                if (cdxlines != null && cdxlines.hasNext()) {
                    if (this.hdfs && this.gIndex > -1) {
                        line = hdfsPath(cdxlines.next(), this.internal.getCurrentValue().toString());
                    } else {
                        line = cdxlines.next();
                    }
                    if (metaTag != null && MIndex != -1)
                        line = setMetaTag(line);
                    this.key.set(line);
                    this.value.set(line);
                    return true;
                } else {
                    if (this.internal.nextKeyValue()) {
                        Path path = new Path(this.internal.getCurrentValue().toString());
                        datainputstream = this.filesystem.open(path);
                        arcreader = ArchiveReaderFactory.get(path.getName(), datainputstream, true);
                        arcreader.setStrict(false);
                        if (path.getName().matches("^.+\\.warc(\\.gz)?$")) {
                            archiveIterator = warcIndexer.iterator((WARCReader) arcreader);
                        } else {
                            archiveIterator = arcIndexer.iterator((ARCReader) arcreader);
                        }
                        cdxlines = SearchResultToCDXFormatAdapter.adapt(archiveIterator, cdxFormat);
                        LOGGER.info("Started reader on " + path.getName());
                    } else {
                        return false;
                    }
                }
            } catch (Exception e) {
                LOGGER.error("nextKeyValue: " + e.getMessage());
            }
        }
    }

    /*
     * Note: to record the compressed (on-disk) length of each record, the
     * ARC/WARC reader would need to be iterated directly, always holding both
     * the current and the next record and deriving the length from the
     * difference between their offsets, i.e. store the current and the next,
     * not just the current, and swap them as part of the 'get next'. A
     * standalone sketch of this idea follows the class below.
     */

    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return this.key;
    }

    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return this.value;
    }

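    /**
     * Rewrites the CDX filename ('g') field: if an ARK lookup table was loaded,
     * the field becomes "identifier#filename"; otherwise it is replaced with the
     * full HDFS path of the archive being read.
     */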
    private String hdfsPath(String cdx, String path) throws URISyntaxException {
        String[] fields = cdx.split(" ");
        if (warcArkLookup.size() != 0) {
            fields[gIndex] = warcArkLookup.get(fields[gIndex]) + "#" + fields[gIndex];
        } else {
            fields[gIndex] = path;
        }
        return StringUtils.join(fields, " ");
    }

    /**
     * We use the "meta tags" field to record the CDX line's licence information.
     * 
     * @param cdx the CDX line to modify
     * @return the CDX line with the configured meta tag prepended to the 'M' field
     */
    private String setMetaTag(String cdx) {
        String[] fields = cdx.split(" ");
        if (fields[MIndex].equals("-")) {
            fields[MIndex] = metaTag;
        } else {
            fields[MIndex] = metaTag + fields[MIndex];
        }
        return StringUtils.join(fields, " ");
    }
}
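
The comment near the end of nextKeyValue() sketches how the compressed length of each record could be obtained by iterating the reader directly and differencing consecutive offsets. A minimal standalone sketch of that idea, assuming the standard ArchiveReader/ArchiveRecord API and a local archive file as input, might look like the following; it is illustrative only and not part of the class above.

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Iterator;

import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveReaderFactory;
import org.archive.io.ArchiveRecord;

public class CompressedLengthSketch {
    public static void main(String[] args) throws IOException {
        File archive = new File(args[0]);
        try (FileInputStream in = new FileInputStream(archive)) {
            ArchiveReader reader = ArchiveReaderFactory.get(archive.getName(), in, true);
            reader.setStrict(false);
            Iterator<ArchiveRecord> it = reader.iterator();
            ArchiveRecord current = it.hasNext() ? it.next() : null;
            while (current != null) {
                long offset = current.getHeader().getOffset();
                // Advance to the next record; the iterator skips over the rest of
                // the current record's content for us.
                ArchiveRecord next = it.hasNext() ? it.next() : null;
                // Compressed length = distance to the next record's offset, or to
                // the end of the file for the final record.
                long end = (next != null) ? next.getHeader().getOffset() : archive.length();
                System.out.println(offset + "\t" + (end - offset));
                current = next; // swap current and next, as the comment suggests
            }
        }
    }
}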