uk.bl.wa.hadoop.mapreduce.WebArchiveRecordReader.java Source code

Introduction

Here is the source code for uk.bl.wa.hadoop.mapreduce.WebArchiveRecordReader.java
Source

/**
 * 
 */
package uk.bl.wa.hadoop.mapreduce;

/*
 * #%L
 * warc-hadoop-recordreaders
 * %%
 * Copyright (C) 2013 - 2018 The webarchive-discovery project contributors
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as
 * published by the Free Software Foundation, either version 2 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/gpl-2.0.html>.
 * #L%
 */

import java.io.EOFException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.log4j.Logger;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveReaderFactory;
import org.archive.io.ArchiveRecord;

import uk.bl.wa.hadoop.WritableArchiveRecord;

/**
 * Hadoop {@link RecordReader} that walks the records of a web archive file.
 * <p>
 * The archive is opened through {@link ArchiveReaderFactory}; every
 * {@link ArchiveRecord} it yields is exposed as a key/value pair where the key
 * is the archive file name and the value is the record wrapped in a
 * {@link WritableArchiveRecord}. The same key and value instances are re-used
 * for every record.
 * <p>
 * Only {@link FileSplit} inputs are accepted, and zero-length files are
 * skipped. The first archive is opened lazily by the first call to
 * {@link #nextKeyValue()}.
 *
 * @author Andrew Jackson <Andrew.Jackson@bl.uk>
 *
 */
public class WebArchiveRecordReader extends RecordReader<Text, WritableArchiveRecord> {

    private static final Logger log = Logger.getLogger(WebArchiveRecordReader.class.getName());

    /** Raw stream over the archive currently being read, or null if none is open. */
    private FSDataInputStream datainputstream;
    /** Status of the archive currently being read; its length drives progress reporting. */
    private FileStatus status;
    private FileSystem filesystem;
    /** Non-empty input paths, established by {@link #initialize}. */
    private Path[] paths;
    /** Index into {@link #paths} of the current archive; -1 until the first one is opened. */
    int currentPath = -1;
    // Not used by this class; kept because it is package-visible.
    Long offset = 0L;
    private ArchiveReader arcreader;
    /** Record iterator of the current archive, or null if no archive is open. */
    private Iterator<ArchiveRecord> iterator;
    private ArchiveRecord record;
    private String archiveName;

    private Text key = new Text();
    private WritableArchiveRecord value = new WritableArchiveRecord();

    /**
     * Records the path of the split and resolves the file system it lives on.
     * Zero-length files are logged and dropped so that no attempt is made to
     * parse them.
     *
     * @param split   the input split; must be a {@link FileSplit}
     * @param context the task context, used for its configuration
     * @throws IOException if the split is not a {@link FileSplit} or the file
     *                     status cannot be obtained
     */
    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        if (!(split instanceof FileSplit)) {
            throw new IOException("InputSplit is not a file split or a multi-file split - aborting");
        }
        Path[] candidates = new Path[] { ((FileSplit) split).getPath() };

        // get correct file system in case there are many (such as in EMR)
        this.filesystem = FileSystem.get(candidates[0].toUri(), context.getConfiguration());

        // Log the paths and keep only the non-empty files:
        List<Path> validPaths = new ArrayList<Path>();
        for (Path candidate : candidates) {
            log.info("Processing path: " + candidate);
            if (this.filesystem.getFileStatus(candidate).getLen() == 0) {
                log.warn("Skipping empty file: " + candidate);
            } else {
                validPaths.add(candidate);
            }
        }
        this.paths = validPaths.toArray(new Path[0]);
    }

    /**
     * Reports how far the underlying stream has advanced through the current
     * archive, based on the raw stream position and the file length.
     *
     * @return a value between 0.0 and 1.0: 0.0 before the first archive has been
     *         requested, 1.0 once no archive is open any more
     */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        if (datainputstream == null || this.status == null) {
            // Nothing is open: either reading has not started yet, or all
            // input has been consumed / the reader has been closed.
            return currentPath < 0 ? 0.0f : 1.0f;
        }
        long length = this.status.getLen();
        if (length <= 0) {
            // Avoid NaN/Infinity from a division by zero.
            return 1.0f;
        }
        float fraction = (float) datainputstream.getPos() / (float) length;
        return Math.min(1.0f, Math.max(0.0f, fraction));
    }

    /**
     * Releases the archive reader and the underlying stream. Failures while
     * closing are logged rather than propagated.
     */
    @Override
    public void close() throws IOException {
        closeCurrentArchive();
    }

    /**
     * @return the re-used key, holding the name of the archive the current
     *         record was read from
     */
    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return this.key;
    }

    /**
     * @return the re-used value, wrapping the most recently read record
     */
    @Override
    public WritableArchiveRecord getCurrentValue() throws IOException, InterruptedException {
        return this.value;
    }

    /**
     * Advances to the next record, opening the next archive when the current
     * one is exhausted (or when none has been opened yet).
     * <p>
     * Errors raised while reading are logged and reading continues on a
     * best-effort basis; an error caused by an {@link EOFException} abandons
     * the current archive and moves on to the next one.
     *
     * @return true if a record was loaded into the current key/value, false
     *         once all input has been consumed
     * @throws IOException if the next archive cannot be opened while
     *                     recovering from an EOF
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        boolean found = false;
        while (!found) {
            boolean hasNext = false;
            // There is no iterator until the first archive has been opened;
            // in that case fall straight through to nextFile().
            if (iterator != null) {
                try {
                    hasNext = iterator.hasNext();
                } catch (Throwable e) {
                    // Treat a failing hasNext() as the end of this archive.
                    log.error("ERROR in hasNext():  " + this.archiveName + ": " + e.toString(), e);
                    hasNext = false;
                }
            }
            try {
                if (hasNext) {
                    record = iterator.next();
                    this.key.set(this.archiveName);
                    this.value.setRecord(record);
                    found = true;
                } else if (!this.nextFile()) {
                    break;
                }
            } catch (Throwable e) {
                found = false;
                log.error("ERROR reading " + this.archiveName, e);
                // Reached the end of the file? If so move on or exit:
                if (e.getCause() instanceof EOFException) {
                    log.error("EOF while reading " + this.archiveName);
                    if (!this.nextFile()) {
                        break;
                    }
                }
                // NOTE(review): any other failure retries the same iterator on
                // the next pass; this presumably relies on strict mode making
                // hasNext()/next() eventually give up on a malformed archive.
            }
        }
        return found;
    }

    /**
     * Closes the archive that is currently open (if any) and opens the next
     * path, installing its record iterator.
     *
     * @return true if another archive was opened, false if there are no more
     *         paths
     * @throws IOException if the next archive cannot be opened
     */
    private boolean nextFile() throws IOException {
        // Release the previous archive before its references are overwritten.
        closeCurrentArchive();
        currentPath++;
        if (currentPath >= paths.length) {
            return false;
        }
        Path path = paths[currentPath];
        // Set the name first so that failures while opening are logged
        // against the right file.
        this.archiveName = path.getName();
        // Output the archive filename, to help with debugging:
        log.info("Opening nextFile: " + path);
        // Set up the ArchiveReader:
        this.status = this.filesystem.getFileStatus(path);
        datainputstream = this.filesystem.open(path);
        arcreader = (ArchiveReader) ArchiveReaderFactory.get(this.archiveName, datainputstream, true);
        // Set to strict reading, in order to cope with malformed archive files
        // which cause an infinite loop otherwise.
        arcreader.setStrict(true);
        // Get the iterator:
        iterator = arcreader.iterator();
        return true;
    }

    /**
     * Closes the current archive reader and stream, logging (not throwing) any
     * failure, and clears the per-archive state so nothing stale is re-used.
     */
    private void closeCurrentArchive() {
        this.iterator = null;
        if (arcreader != null) {
            try {
                arcreader.close();
            } catch (IOException e) {
                log.error("close(): " + e.getMessage(), e);
            }
            arcreader = null;
        }
        if (datainputstream != null) {
            try {
                datainputstream.close();
            } catch (IOException e) {
                log.error("close(): " + e.getMessage(), e);
            }
            datainputstream = null;
        }
        this.status = null;
    }

}