Example usage for org.apache.hadoop.mapreduce RecordReader initialize

List of usage examples for org.apache.hadoop.mapreduce RecordReader initialize

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce RecordReader initialize.

Prototype

public abstract void initialize(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException;

Source Link

Document

Called once at initialization.
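
Before the project examples below, here is a minimal sketch (not taken from any of the listed projects; the class name and delegation to LineRecordReader are illustrative) of where initialize fits in a custom RecordReader: the framework calls it once, after createRecordReader and before the first nextKeyValue, so it is the place to open the split.

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

// Illustrative reader that simply wraps Hadoop's LineRecordReader.
public class DelegatingLineRecordReader extends RecordReader<LongWritable, Text> {
    private final LineRecordReader delegate = new LineRecordReader();

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        // Called once before the first nextKeyValue(); open the split here.
        delegate.initialize(split, context);
    }

    @Override
    public boolean nextKeyValue() throws IOException {
        return delegate.nextKeyValue();
    }

    @Override
    public LongWritable getCurrentKey() {
        return delegate.getCurrentKey();
    }

    @Override
    public Text getCurrentValue() {
        return delegate.getCurrentValue();
    }

    @Override
    public float getProgress() throws IOException {
        return delegate.getProgress();
    }

    @Override
    public void close() throws IOException {
        delegate.close();
    }
}

Note that several of the input formats below call initialize themselves inside createRecordReader; the framework normally calls it again before reading, so in that pattern a reader's initialize may run more than once and should tolerate it.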

Usage

From source file:org.mrgeo.format.AutoFeatureInputFormat.java

License:Apache License

@Override
public RecordReader<LongWritable, Geometry> createRecordReader(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
    RecordReader<LongWritable, Geometry> result = new AutoRecordReader();
    result.initialize(split, context);
    return result;
}

From source file:org.mrgeo.format.AutoGeometryInputFormat.java

License:Apache License

@Override
public RecordReader<LongWritable, GeometryWritable> createRecordReader(InputSplit split,
        TaskAttemptContext context) throws IOException, InterruptedException {
    RecordReader<LongWritable, GeometryWritable> result = new AutoRecordReader();
    result.initialize(split, context);
    return result;
}

From source file:org.mrgeo.format.CsvInputFormatTest.java

License:Apache License

@Test
@Category(UnitTest.class)
public void testNullIgnore() throws Exception {
    FileSystem fs = new RawLocalFileSystem();
    try {
        int lineCount = 0;

        // Write columns file which defines the columns title and type
        String cstr = "<?xml version='1.0' encoding='UTF-8'?>\n<AllColumns firstLineHeader='false'>\n";
        cstr += "  <Column name='name' type='Nominal'/>\n";
        cstr += "  <Column name='x' type='Numeric'/>\n";
        cstr += "  <Column name='y' type='Numeric'/>\n";
        cstr += "</AllColumns>\n";
        FileOutputStream fos = new FileOutputStream(output + "/nulXY.csv.columns");
        PrintStream ps = new PrintStream(fos);
        ps.print(cstr);
        ps.close();

        // Write csv test data
        fos = new FileOutputStream(output + "/nullXY.csv");
        ps = new PrintStream(fos);
        // populated rows
        for (int ii = 0; ii < 10; ii++) {
            ps.print("ASDF,1.0,1.0\n");
            lineCount++;
        }
        // empty rows
        ps.print("ASDF,,1.0\n");
        ps.print("ASDF,1.0,\n");
        ps.print("ASDF,,\n");
        lineCount += 3;
        // populated rows
        for (int ii = 0; ii < 5; ii++) {
            ps.print("ASDF,1.0,1.0\n");
            lineCount++;
        }
        ps.close();

        System.out.println(output + "nulXY.csv");

        Job j = new Job(new Configuration());
        Configuration c = j.getConfiguration();
        fs.setConf(c);
        Path testFile = new Path(output, "nullXY.csv");
        testFile = fs.makeQualified(testFile);
        InputSplit split;
        long l;
        long start;

        TextInputFormat format = new TextInputFormat();
        split = new FileSplit(testFile, 0, lineCount * 1000, null);
        RecordReader<LongWritable, Text> reader2 = format.createRecordReader(split,
                HadoopUtils.createTaskAttemptContext(c, new TaskAttemptID()));

        reader2.initialize(split, HadoopUtils.createTaskAttemptContext(c, new TaskAttemptID()));
        l = 0;
        start = System.currentTimeMillis();
        while (reader2.nextKeyValue()) {
            reader2.getCurrentValue().toString();
            l++;
        }
        Assert.assertEquals(lineCount, l);
        System.out.printf("text line reader with null x,y ignore: %d\n", System.currentTimeMillis() - start);

    } catch (Exception e) {
        e.printStackTrace();
        throw e;
    } finally {
        fs.close();
    }
}

From source file:org.mrgeo.hdfs.ingest.format.IngestImageSplittingInputFormat.java

License:Apache License

@Override
public RecordReader<TileIdWritable, RasterWritable> createRecordReader(final InputSplit split,
        final TaskAttemptContext context) throws IOException, InterruptedException {
    final RecordReader<TileIdWritable, RasterWritable> reader = new IngestImageSplittingRecordReader();
    //FIXME: This seems to be called from AutoFeatureInputFormat.initialize()
    reader.initialize(split, context);

    return reader;
}

From source file:org.mrgeo.hdfs.vector.ShpInputFormatTest.java

License:Apache License

public RecordReader<FeatureIdWritable, Geometry> openReader(Path p) throws IOException, InterruptedException {
    Job j = new Job(new Configuration());
    Configuration c = j.getConfiguration();
    try (FileSystem fs = new RawLocalFileSystem()) {
        fs.setConf(c);
        Path testFile = fs.makeQualified(p);

        c.set("mapred.input.dir", testFile.toString());
        ShpInputFormat format = new ShpInputFormat();
        InputSplit split = format.getSplits(j).get(0);
        TaskAttemptContext context = HadoopUtils.createTaskAttemptContext(c, new TaskAttemptID());
        RecordReader<FeatureIdWritable, Geometry> reader = format.createRecordReader(split, context);
        reader.initialize(split, context);
        return reader;
    }
}

From source file:org.mrgeo.mapreduce.ingestvector.IngestVectorGeometryInputFormat.java

License:Apache License

@Override
public RecordReader<LongWritable, GeometryWritable> createRecordReader(InputSplit split,
        TaskAttemptContext context) throws IOException, InterruptedException {
    final RecordReader<LongWritable, GeometryWritable> reader = new IngestVectorRecordReader();

    reader.initialize(split, context);

    return reader;
}

From source file:org.msgpack.hadoop.mapreduce.input.TestMessagePackInputFormat.java

License:Apache License

void checkFormat(Job job) throws Exception {
    TaskAttemptContext attemptContext = new TaskAttemptContext(job.getConfiguration(),
            new TaskAttemptID("123", 0, false, 1, 2));

    MessagePackInputFormat format = new MessagePackInputFormat();
    FileInputFormat.setInputPaths(job, workDir);

    List<InputSplit> splits = format.getSplits(job);
    assertEquals(1, splits.size());
    for (int j = 0; j < splits.size(); j++) {
        RecordReader<LongWritable, MessagePackWritable> reader = format.createRecordReader(splits.get(j),
                attemptContext);
        reader.initialize(splits.get(j), attemptContext);

        int count = 0;
        try {
            while (reader.nextKeyValue()) {
                LongWritable key = reader.getCurrentKey();
                Value val = reader.getCurrentValue().get();
                assertEquals(count, val.asIntegerValue().getLong());
                count++;
            }
        } finally {
            reader.close();
        }
    }
}

From source file:org.tensorflow.hadoop.io.TFRecordFileTest.java

License:Open Source License

@Test
public void testInputOutputFormat() throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);

    Path outdir = new Path(System.getProperty("test.build.data", "/tmp"), "tfr-test");

    TFRecordFileOutputFormat.setOutputPath(job, outdir);

    TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
    OutputFormat<BytesWritable, NullWritable> outputFormat = new TFRecordFileOutputFormat();
    OutputCommitter committer = outputFormat.getOutputCommitter(context);
    committer.setupJob(job);
    RecordWriter<BytesWritable, NullWritable> writer = outputFormat.getRecordWriter(context);

    // Write Example with random numbers
    Random rand = new Random();
    Map<Long, Long> records = new TreeMap<Long, Long>();
    try {
        for (int i = 0; i < RECORDS; ++i) {
            long randValue = rand.nextLong();
            records.put((long) i, randValue);
            Int64List data = Int64List.newBuilder().addValue(i).addValue(randValue).build();
            Feature feature = Feature.newBuilder().setInt64List(data).build();
            Features features = Features.newBuilder().putFeature("data", feature).build();
            Example example = Example.newBuilder().setFeatures(features).build();
            BytesWritable key = new BytesWritable(example.toByteArray());
            writer.write(key, NullWritable.get());
        }
    } finally {
        writer.close(context);
    }
    committer.commitTask(context);
    committer.commitJob(job);

    // Read and compare
    TFRecordFileInputFormat.setInputPaths(job, outdir);
    InputFormat<BytesWritable, NullWritable> inputFormat = new TFRecordFileInputFormat();
    for (InputSplit split : inputFormat.getSplits(job)) {
        RecordReader<BytesWritable, NullWritable> reader = inputFormat.createRecordReader(split, context);
        MapContext<BytesWritable, NullWritable, BytesWritable, NullWritable> mcontext = new MapContextImpl<BytesWritable, NullWritable, BytesWritable, NullWritable>(
                job.getConfiguration(), context.getTaskAttemptID(), reader, null, null,
                MapReduceTestUtil.createDummyReporter(), split);
        reader.initialize(split, mcontext);
        try {
            while (reader.nextKeyValue()) {
                BytesWritable bytes = reader.getCurrentKey();
                Example example = Example.parseFrom(bytes.getBytes());
                Int64List data = example.getFeatures().getFeatureMap().get("data").getInt64List();
                Long key = data.getValue(0);
                Long value = data.getValue(1);
                assertEquals(records.get(key), value);
                records.remove(key);
            }
        } finally {
            reader.close();
        }
    }
    assertEquals(0, records.size());
}

From source file:org.warcbase.io.GenericArchiveRecordWritableTest.java

License:Apache License

@Test
public void testArcInputFormat() throws Exception {
    String arcFile = Resources.getResource("arc/example.arc.gz").getPath();

    Configuration conf = new Configuration(false);
    conf.set("fs.defaultFS", "file:///");

    File testFile = new File(arcFile);
    Path path = new Path(testFile.getAbsoluteFile().toURI());
    FileSplit split = new FileSplit(path, 0, testFile.length(), null);

    InputFormat<LongWritable, GenericArchiveRecordWritable> inputFormat = ReflectionUtils
            .newInstance(WacGenericInputFormat.class, conf);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    RecordReader<LongWritable, GenericArchiveRecordWritable> reader = inputFormat.createRecordReader(split,
            context);

    reader.initialize(split, context);

    int cnt = 0;
    while (reader.nextKeyValue()) {
        GenericArchiveRecordWritable record = reader.getCurrentValue();
        cnt++;

        ByteArrayOutputStream bytesOut = new ByteArrayOutputStream();
        DataOutputStream dataOut = new DataOutputStream(bytesOut);

        record.write(dataOut);

        GenericArchiveRecordWritable reconstructed = new GenericArchiveRecordWritable();

        reconstructed.setFormat(ArchiveFormat.ARC);
        reconstructed.readFields(new DataInputStream(new ByteArrayInputStream(bytesOut.toByteArray())));

        boolean isArc = (record.getFormat() == ArchiveFormat.ARC);
        assertTrue(isArc);
        if (isArc) {
            assertEquals(((ARCRecord) record.getRecord()).getMetaData().getUrl(),
                    ((ARCRecord) reconstructed.getRecord()).getMetaData().getUrl());
        }
    }

    assertEquals(300, cnt);
}

From source file:org.warcbase.io.GenericArchiveRecordWritableTest.java

License:Apache License

@Test
public void testWarcInputFormat() throws Exception {
    String warcFile = Resources.getResource("warc/example.warc.gz").getPath();

    Configuration conf = new Configuration(false);
    conf.set("fs.defaultFS", "file:///");

    File testFile = new File(warcFile);
    Path path = new Path(testFile.getAbsoluteFile().toURI());
    FileSplit split = new FileSplit(path, 0, testFile.length(), null);

    InputFormat<LongWritable, GenericArchiveRecordWritable> inputFormat = ReflectionUtils
            .newInstance(WacGenericInputFormat.class, conf);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    RecordReader<LongWritable, GenericArchiveRecordWritable> reader = inputFormat.createRecordReader(split,
            context);

    reader.initialize(split, context);

    int cnt = 0;
    while (reader.nextKeyValue()) {
        GenericArchiveRecordWritable record = reader.getCurrentValue();

        cnt++;

        ByteArrayOutputStream bytesOut = new ByteArrayOutputStream();
        DataOutputStream dataOut = new DataOutputStream(bytesOut);

        record.write(dataOut);

        GenericArchiveRecordWritable reconstructed = new GenericArchiveRecordWritable();

        reconstructed.setFormat(ArchiveFormat.WARC);
        reconstructed.readFields(new DataInputStream(new ByteArrayInputStream(bytesOut.toByteArray())));

        boolean isWarc = (record.getFormat() == ArchiveFormat.WARC);
        assertTrue(isWarc);
        if (isWarc) {
            assertEquals(record.getRecord().getHeader().getUrl(),
                    reconstructed.getRecord().getHeader().getUrl());
            assertEquals(record.getRecord().getHeader().getContentLength(),
                    reconstructed.getRecord().getHeader().getContentLength());
        }
    }

    assertEquals(822, cnt);
}