Usage examples for org.apache.hadoop.mapreduce.InputFormat.createRecordReader
// Signature of the InputFormat method that the examples below call: it takes an InputSplit and a
// TaskAttemptContext and returns a RecordReader<K, V>.
public abstract RecordReader<K, V> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException;
From source file: org.warcbase.mapreduce.WacGenericInputFormatTest.java
License: Apache License
@Test public void testArcInputFormat() throws Exception { String[] urls = new String[] { "filedesc://IAH-20080430204825-00000-blackbook.arc", "dns:www.archive.org", "http://www.archive.org/robots.txt", "http://www.archive.org/", "http://www.archive.org/index.php" }; String arcFile = Resources.getResource("arc/example.arc.gz").getPath(); Configuration conf = new Configuration(false); conf.set("fs.defaultFS", "file:///"); File testFile = new File(arcFile); Path path = new Path(testFile.getAbsoluteFile().toURI()); FileSplit split = new FileSplit(path, 0, testFile.length(), null); InputFormat<LongWritable, GenericArchiveRecordWritable> inputFormat = ReflectionUtils .newInstance(WacGenericInputFormat.class, conf); TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID()); RecordReader<LongWritable, GenericArchiveRecordWritable> reader = inputFormat.createRecordReader(split, context);// ww w . ja v a 2 s .co m reader.initialize(split, context); int cnt = 0; while (reader.nextKeyValue()) { ArchiveRecord record = reader.getCurrentValue().getRecord(); boolean isArc = record instanceof ARCRecord; assertTrue(isArc); if (isArc) { ARCRecord arcRecord = (ARCRecord) record; ARCRecordMetaData metadata = arcRecord.getMetaData(); if (cnt < urls.length) { assertEquals(urls[cnt], metadata.getUrl()); } } cnt++; } assertEquals(300, cnt); }
From source file: org.warcbase.mapreduce.WacGenericInputFormatTest.java
License: Apache License
@Test public void testWarcInputFormat() throws Exception { String[] urls = new String[] { null, "dns:www.archive.org", "http://www.archive.org/robots.txt", "http://www.archive.org/robots.txt", "http://www.archive.org/robots.txt", "http://www.archive.org/", "http://www.archive.org/", "http://www.archive.org/", "http://www.archive.org/index.php", "http://www.archive.org/index.php" }; String[] types = new String[] { "warcinfo", "response", "response", "request", "metadata", "response", "request", "metadata", "response", "request" }; String arcFile = Resources.getResource("warc/example.warc.gz").getPath(); Configuration conf = new Configuration(false); conf.set("fs.defaultFS", "file:///"); File testFile = new File(arcFile); Path path = new Path(testFile.getAbsoluteFile().toURI()); FileSplit split = new FileSplit(path, 0, testFile.length(), null); InputFormat<LongWritable, GenericArchiveRecordWritable> inputFormat = ReflectionUtils .newInstance(WacGenericInputFormat.class, conf); TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID()); RecordReader<LongWritable, GenericArchiveRecordWritable> reader = inputFormat.createRecordReader(split, context);// w w w . j av a 2 s .c o m reader.initialize(split, context); assertTrue(urls.length == types.length); int cnt = 0; int responseCnt = 0; while (reader.nextKeyValue()) { ArchiveRecord record = reader.getCurrentValue().getRecord(); boolean isWarc = record instanceof WARCRecord; assertTrue(isWarc); if (isWarc) { WARCRecord warcRecord = (WARCRecord) record; if (cnt < urls.length) { assertEquals(urls[cnt], warcRecord.getHeader().getUrl()); assertEquals(types[cnt], warcRecord.getHeader().getHeaderValue("WARC-Type")); } if (warcRecord.getHeader().getHeaderValue("WARC-Type").equals("response")) { responseCnt++; } } cnt++; } assertEquals(822, cnt); assertEquals(299, responseCnt); }