Example usage for org.apache.hadoop.mapreduce InputFormat createRecordReader

List of usage examples for org.apache.hadoop.mapreduce.InputFormat.createRecordReader

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.InputFormat.createRecordReader.

Prototype

/**
 * Create a record reader for a given split.
 */
public abstract RecordReader<K, V> createRecordReader(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException;

Source Link

Document

Create a record reader for a given split.

Usage

From source file:org.warcbase.mapreduce.WacGenericInputFormatTest.java

License:Apache License

/**
 * Reads the {@code arc/example.arc.gz} test resource through {@code WacGenericInputFormat} and
 * verifies that every record is an {@code ARCRecord}, that the first records carry the expected
 * URLs in order, and that the file yields 300 records in total.
 *
 * @throws Exception if the resource cannot be located or read
 */
@Test
public void testArcInputFormat() throws Exception {
    String[] urls = new String[] { "filedesc://IAH-20080430204825-00000-blackbook.arc", "dns:www.archive.org",
            "http://www.archive.org/robots.txt", "http://www.archive.org/",
            "http://www.archive.org/index.php" };

    // Go through a URI instead of URL.getPath(): getPath() leaves percent-escapes (e.g. %20)
    // undecoded, so the file would not be found when the checkout path contains spaces.
    File testFile = new File(Resources.getResource("arc/example.arc.gz").toURI());

    Configuration conf = new Configuration(false);
    conf.set("fs.defaultFS", "file:///");

    Path path = new Path(testFile.getAbsoluteFile().toURI());
    FileSplit split = new FileSplit(path, 0, testFile.length(), null);

    InputFormat<LongWritable, GenericArchiveRecordWritable> inputFormat = ReflectionUtils
            .newInstance(WacGenericInputFormat.class, conf);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    RecordReader<LongWritable, GenericArchiveRecordWritable> reader = inputFormat.createRecordReader(split,
            context);

    int cnt = 0;
    try {
        reader.initialize(split, context);

        while (reader.nextKeyValue()) {
            ArchiveRecord record = reader.getCurrentValue().getRecord();
            // A failed assertion aborts the test, so the cast below is safe.
            assertTrue(record instanceof ARCRecord);

            // Only the leading records have a known expected URL.
            if (cnt < urls.length) {
                ARCRecordMetaData metadata = ((ARCRecord) record).getMetaData();
                assertEquals(urls[cnt], metadata.getUrl());
            }

            cnt++;
        }
    } finally {
        // Release the underlying archive stream even when an assertion fails.
        reader.close();
    }
    assertEquals(300, cnt);
}

From source file:org.warcbase.mapreduce.WacGenericInputFormatTest.java

License:Apache License

/**
 * Reads the {@code warc/example.warc.gz} test resource through {@code WacGenericInputFormat} and
 * verifies that every record is a {@code WARCRecord}, that the first records carry the expected
 * URLs and {@code WARC-Type} values in order, and that the file yields 822 records in total,
 * 299 of which are of type {@code response}.
 *
 * @throws Exception if the resource cannot be located or read
 */
@Test
public void testWarcInputFormat() throws Exception {
    String[] urls = new String[] { null, "dns:www.archive.org", "http://www.archive.org/robots.txt",
            "http://www.archive.org/robots.txt", "http://www.archive.org/robots.txt", "http://www.archive.org/",
            "http://www.archive.org/", "http://www.archive.org/", "http://www.archive.org/index.php",
            "http://www.archive.org/index.php" };

    String[] types = new String[] { "warcinfo", "response", "response", "request", "metadata", "response",
            "request", "metadata", "response", "request" };

    // The two expectation tables are indexed in lockstep below.
    assertEquals(urls.length, types.length);

    // Go through a URI instead of URL.getPath(): getPath() leaves percent-escapes (e.g. %20)
    // undecoded, so the file would not be found when the checkout path contains spaces.
    File testFile = new File(Resources.getResource("warc/example.warc.gz").toURI());

    Configuration conf = new Configuration(false);
    conf.set("fs.defaultFS", "file:///");

    Path path = new Path(testFile.getAbsoluteFile().toURI());
    FileSplit split = new FileSplit(path, 0, testFile.length(), null);

    InputFormat<LongWritable, GenericArchiveRecordWritable> inputFormat = ReflectionUtils
            .newInstance(WacGenericInputFormat.class, conf);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    RecordReader<LongWritable, GenericArchiveRecordWritable> reader = inputFormat.createRecordReader(split,
            context);

    int cnt = 0;
    int responseCnt = 0;
    try {
        reader.initialize(split, context);

        while (reader.nextKeyValue()) {
            ArchiveRecord record = reader.getCurrentValue().getRecord();
            // A failed assertion aborts the test, so the cast below is safe.
            assertTrue(record instanceof WARCRecord);

            WARCRecord warcRecord = (WARCRecord) record;
            Object warcType = warcRecord.getHeader().getHeaderValue("WARC-Type");

            // Only the leading records have known expected values.
            if (cnt < urls.length) {
                assertEquals(urls[cnt], warcRecord.getHeader().getUrl());
                assertEquals(types[cnt], warcType);
            }

            // Constant-first equals: a record without a WARC-Type header is simply not counted
            // instead of aborting the test with a NullPointerException.
            if ("response".equals(warcType)) {
                responseCnt++;
            }

            cnt++;
        }
    } finally {
        // Release the underlying archive stream even when an assertion fails.
        reader.close();
    }
    assertEquals(822, cnt);
    assertEquals(299, responseCnt);
}