Example usage for org.apache.hadoop.mapred RecordReader getPos

Introduction

This page collects example usages of getPos from org.apache.hadoop.mapred.RecordReader.

Prototype

long getPos() throws IOException;

Document

Returns the current position in the input.
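For orientation, here is a minimal sketch of calling getPos on a RecordReader from the old mapred API. It assumes a plain TextInputFormat reading a local text file; the path string "input.txt" and the class name GetPosExample are placeholders for illustration and do not come from any of the sources listed under Usage.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class GetPosExample {
    public static void main(String[] args) throws IOException {
        JobConf job = new JobConf();
        // placeholder input path; point it at any text file on the default FileSystem
        FileInputFormat.setInputPaths(job, new Path("input.txt"));

        TextInputFormat format = new TextInputFormat();
        format.configure(job);

        for (InputSplit split : format.getSplits(job, 1)) {
            RecordReader<LongWritable, Text> reader = format.getRecordReader(split, job, Reporter.NULL);
            LongWritable key = reader.createKey();
            Text value = reader.createValue();
            try {
                while (reader.next(key, value)) {
                    // getPos() reports the reader's current position in the input
                    // (for a line reader this is a byte offset within the split's file)
                    System.out.println("pos=" + reader.getPos() + "\t" + value);
                }
            } finally {
                reader.close();
            }
        }
    }
}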

Usage

From source file: cascading.tap.hadoop.ZipInputFormatTest.java

License: Open Source License

public void testSplits() throws Exception {
    JobConf job = new JobConf();
    FileSystem currentFs = FileSystem.get(job);

    Path file = new Path(workDir, "test.zip");

    Reporter reporter = Reporter.NULL;

    int seed = new Random().nextInt();
    LOG.info("seed = " + seed);
    Random random = new Random(seed);
    FileInputFormat.setInputPaths(job, file);

    for (int entries = 1; entries < MAX_ENTRIES; entries += random.nextInt(MAX_ENTRIES / 10) + 1) {
        ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
        ZipOutputStream zos = new ZipOutputStream(byteArrayOutputStream);
        long length = 0;

        LOG.debug("creating; zip file with entries = " + entries);

        // for each entry in the zip file
        for (int entryCounter = 0; entryCounter < entries; entryCounter++) {
            // construct zip entries splitting MAX_LENGTH between entries
            long entryLength = MAX_LENGTH / entries;
            ZipEntry zipEntry = new ZipEntry("/entry" + entryCounter + ".txt");
            zipEntry.setMethod(ZipEntry.DEFLATED);
            zos.putNextEntry(zipEntry);

            for (length = entryCounter * entryLength; length < (entryCounter + 1) * entryLength; length++) {
                zos.write(Long.toString(length).getBytes());
                zos.write("\n".getBytes());
            }

            zos.flush();
            zos.closeEntry();
        }

        zos.flush();
        zos.close();

        currentFs.delete(file, true);

        OutputStream outputStream = currentFs.create(file);

        byteArrayOutputStream.writeTo(outputStream);
        outputStream.close();

        ZipInputFormat format = new ZipInputFormat();
        format.configure(job);
        LongWritable key = new LongWritable();
        Text value = new Text();
        InputSplit[] splits = format.getSplits(job, 100);

        BitSet bits = new BitSet((int) length);
        for (int j = 0; j < splits.length; j++) {
            LOG.debug("split[" + j + "]= " + splits[j]);
            RecordReader<LongWritable, Text> reader = format.getRecordReader(splits[j], job, reporter);

            try {
                int count = 0;

                while (reader.next(key, value)) {
                    int v = Integer.parseInt(value.toString());
                    LOG.debug("read " + v);

                    if (bits.get(v))
                        LOG.warn("conflict with " + v + " in split " + j + " at position " + reader.getPos());

                    assertFalse("key in multiple partitions.", bits.get(v));
                    bits.set(v);
                    count++;
                }

                LOG.debug("splits[" + j + "]=" + splits[j] + " count=" + count);
            } finally {
                reader.close();
            }
        }

        assertEquals("some keys in no partition.", length, bits.cardinality());
    }
}

From source file: com.facebook.hive.orc.TestInputOutputFormat.java

License: Apache License

@Test
public void testInOutFormat() throws Exception {
    Properties properties = new Properties();
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
                ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    SerDe serde = new OrcSerde();
    HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
    FileSinkOperator.RecordWriter writer = outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true,
            properties, Reporter.NULL);
    ReaderWriterProfiler.setProfilerOptions(conf);
    writer.write(serde.serialize(new MyRow(1, 2), inspector));
    writer.write(serde.serialize(new MyRow(2, 2), inspector));
    writer.write(serde.serialize(new MyRow(3, 2), inspector));
    writer.close(true);
    serde = new OrcSerde();
    properties.setProperty("columns", "x,y");
    properties.setProperty("columns.types", "int:int");
    serde.initialize(conf, properties);
    assertEquals(OrcSerde.OrcSerdeRow.class, serde.getSerializedClass());
    inspector = (StructObjectInspector) serde.getObjectInspector();
    assertEquals("struct<x:int,y:int>", inspector.getTypeName());
    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(1, splits.length);

    // test the validateInput method
    ArrayList<FileStatus> fileList = new ArrayList<FileStatus>(3);
    assertEquals(false, ((InputFormatChecker) in).validateInput(fs, new HiveConf(), fileList));
    fileList.add(fs.getFileStatus(testFilePath));
    assertEquals(true, ((InputFormatChecker) in).validateInput(fs, new HiveConf(), fileList));
    fileList.add(fs.getFileStatus(workDir));
    assertEquals(false, ((InputFormatChecker) in).validateInput(fs, new HiveConf(), fileList));

    // read the whole file
    org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    Object key = reader.createKey();
    Writable value = (Writable) reader.createValue();
    int rowNum = 0;
    List<? extends StructField> fields = inspector.getAllStructFieldRefs();
    IntObjectInspector intInspector = (IntObjectInspector) fields.get(0).getFieldObjectInspector();
    assertEquals(0.0, reader.getProgress(), 0.00001);
    assertEquals(0, reader.getPos());
    while (reader.next(key, value)) {
        assertEquals(++rowNum,
                intInspector.get(inspector.getStructFieldData(serde.deserialize(value), fields.get(0))));
        assertEquals(2,
                intInspector.get(inspector.getStructFieldData(serde.deserialize(value), fields.get(1))));
    }
    assertEquals(3, rowNum);
    assertEquals(1.0, reader.getProgress(), 0.00001);
    reader.close();

    // read just the first column
    conf.set("hive.io.file.readcolumn.ids", "0");
    reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    key = reader.createKey();
    value = (Writable) reader.createValue();
    rowNum = 0;
    fields = inspector.getAllStructFieldRefs();
    while (reader.next(key, value)) {
        assertEquals(++rowNum, intInspector.get(inspector.getStructFieldData(value, fields.get(0))));
        assertEquals(null, inspector.getStructFieldData(value, fields.get(1)));
    }
    assertEquals(3, rowNum);
    reader.close();

    // test the mapping of empty string to all columns
    conf.set("hive.io.file.readcolumn.ids", "");
    reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    key = reader.createKey();
    value = (Writable) reader.createValue();
    rowNum = 0;
    fields = inspector.getAllStructFieldRefs();
    while (reader.next(key, value)) {
        assertEquals(++rowNum, intInspector.get(inspector.getStructFieldData(value, fields.get(0))));
        assertEquals(2,
                intInspector.get(inspector.getStructFieldData(serde.deserialize(value), fields.get(1))));
    }
    assertEquals(3, rowNum);
    reader.close();
}

From source file: com.facebook.hive.orc.TestInputOutputFormat.java

License: Apache License

@Test
public void testEmptyFile() throws Exception {
    JobConf job = new JobConf(conf);
    Properties properties = new Properties();
    HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
    FileSinkOperator.RecordWriter writer = outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true,
            properties, Reporter.NULL);
    writer.close(true);
    properties.setProperty("columns", "x,y");
    properties.setProperty("columns.types", "int:int");
    SerDe serde = new OrcSerde();
    serde.initialize(conf, properties);
    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(1, splits.length);

    // read the whole file
    conf.set("hive.io.file.readcolumn.ids", "0,1");
    org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    Object key = reader.createKey();
    Object value = reader.createValue();
    assertEquals(0.0, reader.getProgress(), 0.00001);
    assertEquals(0, reader.getPos());
    assertEquals(false, reader.next(key, value));
    reader.close();
    assertEquals(null, serde.getSerDeStats());
}

From source file: com.github.dryangkun.hbase.tidx.hive.HiveHBaseTableSnapshotInputFormat.java

License: Apache License

@Override
public RecordReader<ImmutableBytesWritable, ResultWritable> getRecordReader(InputSplit split, JobConf job,
        Reporter reporter) throws IOException {
    setColumns(job);
    final RecordReader<ImmutableBytesWritable, Result> rr = delegate
            .getRecordReader(((HBaseSplit) split).getSnapshotSplit(), job, reporter);

    return new RecordReader<ImmutableBytesWritable, ResultWritable>() {
        @Override
        public boolean next(ImmutableBytesWritable key, ResultWritable value) throws IOException {
            return rr.next(key, value.getResult());
        }

        @Override
        public ImmutableBytesWritable createKey() {
            return rr.createKey();
        }

        @Override
        public ResultWritable createValue() {
            return new ResultWritable(rr.createValue());
        }

        @Override
        public long getPos() throws IOException {
            return rr.getPos();
        }

        @Override
        public void close() throws IOException {
            rr.close();
        }

        @Override
        public float getProgress() throws IOException {
            return rr.getProgress();
        }
    };
}

From source file: com.ibm.jaql.io.hadoop.CompositeInputAdapter.java

License: Apache License

@SuppressWarnings("unchecked")
public RecordReader<JsonHolder, JsonHolder> getRecordReader(InputSplit split, JobConf job, Reporter reporter)
        throws IOException {
    CompositeSplit cSplit = (CompositeSplit) split;

    // 1. get the InputAdapter's array index (i) from the split
    final int idx = cSplit.getAdapterIdx();
    InputSplit baseSplit = cSplit.getSplit();

    try {
        // 2. get the ith adapter's args record
        JsonValue value = this.args.get(idx);
        // JRecord baseArgs = (JRecord) item.getNonNull();
        // record the current index to the job conf
        // ASSUMES: in map/reduce, the format's record reader is called *before*
        // the map class is configured
        writeCurrentIndex(job, idx); // FIXME: no longer needed

        // 3. instantiate and initialize the adapter
        HadoopInputAdapter adapter = (HadoopInputAdapter) AdapterStore.getStore().input
                .getAdapter(/** baseArgs, */
                        value);

        // 4. create a new JobConf j'
        JobConf jTmp = new JobConf(job);

        // 5. call adapter's setupConf(j')
        // ConfiguratorUtil.writeToConf(adapter, jTmp, item/**baseArgs*/);
        adapter.setParallel(jTmp);

        // 6. configure the adapter from j'
        adapter.configure(jTmp);

        // 7. call adapter's getRecordReader with j'
        final RecordReader<JsonHolder, JsonHolder> reader = (RecordReader<JsonHolder, JsonHolder>) adapter
                .getRecordReader(baseSplit, jTmp, reporter);

        if (!addIndex) {
            return reader;
        }

        return new RecordReader<JsonHolder, JsonHolder>() {

            @Override
            public void close() throws IOException {
                reader.close();
            }

            @Override
            public JsonHolder createKey() {
                return reader.createKey();
            }

            @Override
            public JsonHolder createValue() {
                return reader.createValue();
            }

            @Override
            public long getPos() throws IOException {
                return reader.getPos();
            }

            @Override
            public float getProgress() throws IOException {
                return reader.getProgress();
            }

            @Override
            public boolean next(JsonHolder key, JsonHolder value) throws IOException {
                BufferedJsonArray pair = (BufferedJsonArray) value.value;
                if (pair != null) {
                    value.value = pair.get(1);
                } else {
                    pair = new BufferedJsonArray(2);
                    pair.set(0, JsonLong.make(idx));
                }

                if (reader.next(key, value)) {
                    pair.set(1, value.value);
                    value.value = pair;
                    return true;
                }

                return false;
            }
        };

    } catch (Exception e) {
        return null;
    }
}

From source file: com.ibm.jaql.io.hadoop.DefaultHadoopInputAdapter.java

License: Apache License

@SuppressWarnings("unchecked")
public RecordReader<JsonHolder, JsonHolder> getRecordReader(InputSplit split, JobConf job, Reporter reporter)
        throws IOException {
    if (split instanceof DHIASplit) {
        // not using order-preserving wrapper
        split = ((DHIASplit) split).split;
    }

    if (converter == null)
        return ((InputFormat<JsonHolder, JsonHolder>) iFormat).getRecordReader(split, job, reporter);
    final RecordReader<K, V> baseReader = ((InputFormat<K, V>) iFormat).getRecordReader(split, job, reporter);
    final K baseKey = baseReader.createKey();
    final V baseValue = baseReader.createValue();

    return new RecordReader<JsonHolder, JsonHolder>() {

        public void close() throws IOException {
            baseReader.close();
        }

        public JsonHolder createKey() {
            return keyHolder();
        }

        public JsonHolder createValue() {
            JsonHolder holder = valueHolder();
            holder.value = converter.createTarget();
            return holder;
        }

        public long getPos() throws IOException {
            return baseReader.getPos();
        }

        public float getProgress() throws IOException {
            return baseReader.getProgress();
        }

        public boolean next(JsonHolder key, JsonHolder value) throws IOException {
            boolean hasMore = baseReader.next(baseKey, baseValue);
            if (!hasMore)
                return false;
            value.value = converter.convert(baseKey, baseValue, value.value);
            return true;
        }
    };
}