/**
 * Returns the current position in the input.
 *
 * @return the current position in the input
 * @throws IOException declared by the signature; the failure conditions depend on the
 *         implementing reader
 */
long getPos() throws IOException;

Returns the current position in the input.


From source file:cascading.tap.hadoop.ZipInputFormatTest.java

License:Open Source License

/**
 * Checks that the splits produced by {@code ZipInputFormat} cover every record exactly once.
 *
 * <p>For a growing number of zip entries, the test builds a zip archive, asks the format for
 * its splits, reads every split and records each parsed value in a {@link BitSet}. A value seen
 * twice fails with "key in multiple partitions."; a value never seen fails with
 * "some keys in no partition.".
 *
 * <p>NOTE(review): this excerpt is truncated. The code that writes the entries and the archive
 * to {@code file}, the code that sets bits and closes the reader, and several closing braces
 * are not visible here.
 *
 * @throws Exception if any file system, zip or reader operation fails
 */
public void testSplits() throws Exception {
    JobConf job = new JobConf();
    FileSystem currentFs = FileSystem.get(job);

    Path file = new Path(workDir, "test.zip");

    Reporter reporter = Reporter.NULL;

    // The seed is drawn at random and logged before use, presumably so that a failing run
    // can be replayed with the same sequence.
    int seed = new Random().nextInt();
    LOG.info("seed = " + seed);
    Random random = new Random(seed);
    FileInputFormat.setInputPaths(job, file);

    // The entry count advances by a random step between 1 and MAX_ENTRIES / 10 (inclusive).
    for (int entries = 1; entries < MAX_ENTRIES; entries += random.nextInt(MAX_ENTRIES / 10) + 1) {
        ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
        ZipOutputStream zos = new ZipOutputStream(byteArrayOutputStream);
        long length = 0;

        LOG.debug("creating; zip file with entries = " + entries);

        // for each entry in the zip file
        for (int entryCounter = 0; entryCounter < entries; entryCounter++) {
            // construct zip entries splitting MAX_LENGTH between entries
            long entryLength = MAX_LENGTH / entries;
            ZipEntry zipEntry = new ZipEntry("/entry" + entryCounter + ".txt");

            // length walks this entry's slice: [entryCounter * entryLength, (entryCounter + 1) * entryLength)
            for (length = entryCounter * entryLength; length < (entryCounter + 1) * entryLength; length++) {

        // NOTE(review): the body of the loop above and the code finishing the zip stream
        // are not part of this excerpt.


        // Remove any archive left by a previous iteration before creating the new one.
        currentFs.delete(file, true);

        OutputStream outputStream = currentFs.create(file);


        ZipInputFormat format = new ZipInputFormat();
        LongWritable key = new LongWritable();
        Text value = new Text();
        InputSplit[] splits = format.getSplits(job, 100);

        // One bit per value; used to detect values read more than once or not at all.
        BitSet bits = new BitSet((int) length);
        for (int j = 0; j < splits.length; j++) {
            LOG.debug("split[" + j + "]= " + splits[j]);
            RecordReader<LongWritable, Text> reader = format.getRecordReader(splits[j], job, reporter);

            try {
                int count = 0;

                while (reader.next(key, value)) {
                    // Each record's text is parsed as an integer.
                    int v = Integer.parseInt(value.toString());
                    LOG.debug("read " + v);

                    // Log the duplicate together with the reader position before the assertion fails.
                    if (bits.get(v))
                        LOG.warn("conflict with " + v + " in split " + j + " at position " + reader.getPos());

                    assertFalse("key in multiple partitions.", bits.get(v));

                LOG.debug("splits[" + j + "]=" + splits[j] + " count=" + count);
            } finally {

        // The number of distinct values seen must equal length.
        assertEquals("some keys in no partition.", length, bits.cardinality());

From source file:com.facebook.hive.orc.TestInputOutputFormat.java

License:Apache License

/**
 * Round-trips rows through {@code OrcOutputFormat} and {@code OrcInputFormat}.
 *
 * <p>Three {@code MyRow} values are serialized with an {@code OrcSerde} and written; the file is
 * then read back three times: all columns, only column 0 (via
 * {@code hive.io.file.readcolumn.ids = "0"}), and with the column-id property set to the empty
 * string. Each pass expects exactly three rows.
 *
 * <p>NOTE(review): this excerpt is truncated. Several statements and closing braces (for
 * example the end of the {@code synchronized} block, the writer close and parts of the loop
 * bodies) are not visible here.
 *
 * @throws Exception if writing, reading or (de)serialization fails
 */
public void testInOutFormat() throws Exception {
    Properties properties = new Properties();
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
    SerDe serde = new OrcSerde();
    HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
    FileSinkOperator.RecordWriter writer = outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true,
            properties, Reporter.NULL);
    writer.write(serde.serialize(new MyRow(1, 2), inspector));
    writer.write(serde.serialize(new MyRow(2, 2), inspector));
    writer.write(serde.serialize(new MyRow(3, 2), inspector));
    // A fresh serde is initialized from the column properties and used for the read side.
    serde = new OrcSerde();
    properties.setProperty("columns", "x,y");
    properties.setProperty("columns.types", "int:int");
    serde.initialize(conf, properties);
    assertEquals(OrcSerde.OrcSerdeRow.class, serde.getSerializedClass());
    inspector = (StructObjectInspector) serde.getObjectInspector();
    assertEquals("struct<x:int,y:int>", inspector.getTypeName());
    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(1, splits.length);

    // exercise the validateInput method
    // NOTE(review): the three calls below are identical but expect different results, so the
    // statements that modify fileList between them appear to be missing from this excerpt.
    ArrayList<FileStatus> fileList = new ArrayList<FileStatus>(3);
    assertEquals(false, ((InputFormatChecker) in).validateInput(fs, new HiveConf(), fileList));
    assertEquals(true, ((InputFormatChecker) in).validateInput(fs, new HiveConf(), fileList));
    assertEquals(false, ((InputFormatChecker) in).validateInput(fs, new HiveConf(), fileList));

    // read the whole file
    org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    Object key = reader.createKey();
    Writable value = (Writable) reader.createValue();
    int rowNum = 0;
    List<? extends StructField> fields = inspector.getAllStructFieldRefs();
    IntObjectInspector intInspector = (IntObjectInspector) fields.get(0).getFieldObjectInspector();
    // Before the first next() call the reader reports zero progress and position 0.
    assertEquals(0.0, reader.getProgress(), 0.00001);
    assertEquals(0, reader.getPos());
    while (reader.next(key, value)) {
                intInspector.get(inspector.getStructFieldData(serde.deserialize(value), fields.get(0))));
                intInspector.get(inspector.getStructFieldData(serde.deserialize(value), fields.get(1))));
    assertEquals(3, rowNum);
    // After the last row has been consumed the reader reports full progress.
    assertEquals(1.0, reader.getProgress(), 0.00001);

    // read just the first column
    conf.set("hive.io.file.readcolumn.ids", "0");
    reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    key = reader.createKey();
    value = (Writable) reader.createValue();
    rowNum = 0;
    fields = inspector.getAllStructFieldRefs();
    while (reader.next(key, value)) {
        // Column 0 carries the 1-based row number; the unselected column 1 must come back as null.
        assertEquals(++rowNum, intInspector.get(inspector.getStructFieldData(value, fields.get(0))));
        assertEquals(null, inspector.getStructFieldData(value, fields.get(1)));
    assertEquals(3, rowNum);

    // test the mapping of empty string to all columns
    conf.set("hive.io.file.readcolumn.ids", "");
    reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    key = reader.createKey();
    value = (Writable) reader.createValue();
    rowNum = 0;
    fields = inspector.getAllStructFieldRefs();
    while (reader.next(key, value)) {
        assertEquals(++rowNum, intInspector.get(inspector.getStructFieldData(value, fields.get(0))));
                intInspector.get(inspector.getStructFieldData(serde.deserialize(value), fields.get(1))));
    assertEquals(3, rowNum);

From source file:com.facebook.hive.orc.TestInputOutputFormat.java

License:Apache License

/**
 * Reads an ORC file to which no rows were written (in the visible code) and checks the reader's
 * initial state.
 *
 * <p>The test expects a single split, zero progress, position 0, {@code next()} returning
 * {@code false} on the first call, and {@code null} serde statistics.
 *
 * <p>NOTE(review): this excerpt is truncated; any statement closing the writer and the
 * method's closing brace are not visible here.
 *
 * @throws Exception if creating the writer, the serde or the reader fails
 */
public void testEmptyFile() throws Exception {
    JobConf job = new JobConf(conf);
    Properties properties = new Properties();
    HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
    // The writer is created, but no row is written through it in the visible code.
    FileSinkOperator.RecordWriter writer = outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true,
            properties, Reporter.NULL);
    properties.setProperty("columns", "x,y");
    properties.setProperty("columns.types", "int:int");
    SerDe serde = new OrcSerde();
    serde.initialize(conf, properties);
    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(1, splits.length);

    // read the whole file
    conf.set("hive.io.file.readcolumn.ids", "0,1");
    org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    Object key = reader.createKey();
    Object value = reader.createValue();
    assertEquals(0.0, reader.getProgress(), 0.00001);
    assertEquals(0, reader.getPos());
    // With no rows available the very first next() call must report end of input.
    assertEquals(false, reader.next(key, value));
    assertEquals(null, serde.getSerDeStats());

From source file:com.github.dryangkun.hbase.tidx.hive.HiveHBaseTableSnapshotInputFormat.java

License:Apache License

/**
 * Creates a reader over the snapshot split wrapped by the given {@code HBaseSplit}.
 *
 * <p>The reader obtained from {@code delegate} produces {@code Result} values; the returned
 * anonymous reader adapts it to {@code ResultWritable} by passing the wrapped {@code Result}
 * to the delegate and forwarding key creation, position and progress unchanged.
 *
 * <p>NOTE(review): this excerpt is truncated; the body of {@code close()} and the closing
 * braces of the anonymous class are not visible here.
 *
 * @param split    the split to read; cast to {@code HBaseSplit} without a type check
 * @param job      the job configuration, also passed to {@code setColumns}
 * @param reporter the reporter handed through to the delegate
 * @return a reader that exposes the delegate's records as {@code ResultWritable} values
 * @throws IOException if the delegate fails to create its reader
 */
public RecordReader<ImmutableBytesWritable, ResultWritable> getRecordReader(InputSplit split, JobConf job,
        Reporter reporter) throws IOException {
    setColumns(job);
    final RecordReader<ImmutableBytesWritable, Result> rr = delegate
            .getRecordReader(((HBaseSplit) split).getSnapshotSplit(), job, reporter);

    return new RecordReader<ImmutableBytesWritable, ResultWritable>() {
        // Reads into the Result held by the ResultWritable so the wrapper sees the new data.
        public boolean next(ImmutableBytesWritable key, ResultWritable value) throws IOException {
            return rr.next(key, value.getResult());

        public ImmutableBytesWritable createKey() {
            return rr.createKey();

        // Wraps the delegate's value object in a ResultWritable.
        public ResultWritable createValue() {
            return new ResultWritable(rr.createValue());

        public long getPos() throws IOException {
            return rr.getPos();

        public void close() throws IOException {

        public float getProgress() throws IOException {
            return rr.getProgress();

From source file:com.ibm.jaql.io.hadoop.CompositeInputAdapter.java

License:Apache License

/**
 * Creates a reader for one sub-split of a composite input.
 *
 * <p>The {@code CompositeSplit} names the adapter (by index) that owns the wrapped split. That
 * adapter's reader is created against a copy of the job configuration. When {@code addIndex}
 * is false the adapter's reader is returned directly; otherwise it is wrapped so that every
 * value is delivered as a two-element array {@code [adapterIndex, originalValue]}.
 *
 * <p>NOTE(review): this excerpt is truncated; the remaining arguments of {@code getAdapter},
 * the adapter configuration step, the body of {@code close()} and several closing braces are
 * not visible here.
 *
 * @param split    the split to read; cast to {@code CompositeSplit} without a type check
 * @param job      the job configuration; copied before being handed to the adapter
 * @param reporter the reporter handed through to the adapter's reader
 * @return the adapter's reader, optionally wrapped to add the adapter index; the visible
 *         exception handler returns {@code null}
 * @throws IOException declared by the signature
 */
public RecordReader<JsonHolder, JsonHolder> getRecordReader(InputSplit split, JobConf job, Reporter reporter)
        throws IOException {
    CompositeSplit cSplit = (CompositeSplit) split;

    // 1. get the InputAdapter's array index (i) from the split
    final int idx = cSplit.getAdapterIdx();
    InputSplit baseSplit = cSplit.getSplit();

    try {
        // 2. get the ith adapter's args record
        JsonValue value = this.args.get(idx);
        // JRecord baseArgs = (JRecord) item.getNonNull();
        // record the current index to the job conf
        // ASSUMES: in map/reduce, the format's record reader is called *before*
        // the map class is configured
        writeCurrentIndex(job, idx); // FIXME: no longer needed

        // 3. instantiate and initialize the adapter
        HadoopInputAdapter adapter = (HadoopInputAdapter) AdapterStore.getStore().input
                .getAdapter(/** baseArgs, */

        // 4. create a new JobConf j'
        JobConf jTmp = new JobConf(job);

        // 5. call adapter's setupConf(j')
        // ConfiguratorUtil.writeToConf(adapter, jTmp, item/**baseArgs*/);

        // 6. configure the adapter from j'

        // 7. call adapter's getRecordReader with j'
        final RecordReader<JsonHolder, JsonHolder> reader = (RecordReader<JsonHolder, JsonHolder>) adapter
                .getRecordReader(baseSplit, jTmp, reporter);

        // Without index tagging the adapter's reader can be used as-is.
        if (!addIndex) {
            return reader;

        return new RecordReader<JsonHolder, JsonHolder>() {

            public void close() throws IOException {

            public JsonHolder createKey() {
                return reader.createKey();

            public JsonHolder createValue() {
                return reader.createValue();

            public long getPos() throws IOException {
                return reader.getPos();

            public float getProgress() throws IOException {
                return reader.getProgress();

            public boolean next(JsonHolder key, JsonHolder value) throws IOException {
                // If the holder carries a non-null pair, put the pair's slot 1 back into the
                // holder before delegating, so the wrapped reader gets that value rather than
                // the pair; otherwise create a new pair with the adapter index in slot 0.
                BufferedJsonArray pair = (BufferedJsonArray) value.value;
                if (pair != null) {
                    value.value = pair.get(1);
                } else {
                    pair = new BufferedJsonArray(2);
                    pair.set(0, JsonLong.make(idx));

                // On a successful read, store the fresh value in slot 1 and expose the pair.
                if (reader.next(key, value)) {
                    pair.set(1, value.value);
                    value.value = pair;
                    return true;

                return false;

    // NOTE(review): as far as this excerpt shows, any Exception ends in a null return;
    // confirm whether the full handler logs or rethrows before relying on a non-null result.
    } catch (Exception e) {
        return null;

From source file:com.ibm.jaql.io.hadoop.DefaultHadoopInputAdapter.java

License:Apache License

/**
 * Creates a reader for the given split, converting records when a converter is configured.
 *
 * <p>A {@code DHIASplit} is unwrapped to the split it carries. With no converter, the reader of
 * the underlying input format is returned directly. Otherwise the base reader's key/value pair
 * is converted into the value holder on every {@code next()} call.
 *
 * <p>NOTE(review): this excerpt is truncated; the body of {@code close()} and the closing
 * braces of the anonymous class are not visible here.
 *
 * @param split    the split to read, possibly a {@code DHIASplit} wrapper
 * @param job      the job configuration handed to the input format
 * @param reporter the reporter handed to the input format
 * @return the input format's own reader, or a converting wrapper around it
 * @throws IOException if the input format fails to create its reader
 */
public RecordReader<JsonHolder, JsonHolder> getRecordReader(InputSplit split, JobConf job, Reporter reporter)
        throws IOException {
    if (split instanceof DHIASplit) {
        // not using order-preserving wrapper
        split = ((DHIASplit) split).split;
    }

    // No converter: the input format is cast to produce JsonHolder pairs and its reader is
    // returned without wrapping.
    if (converter == null)
        return ((InputFormat<JsonHolder, JsonHolder>) iFormat).getRecordReader(split, job, reporter);
    final RecordReader<K, V> baseReader = ((InputFormat<K, V>) iFormat).getRecordReader(split, job, reporter);
    // A single key/value pair is created once and reused for every read from the base reader.
    final K baseKey = baseReader.createKey();
    final V baseValue = baseReader.createValue();

    return new RecordReader<JsonHolder, JsonHolder>() {

        public void close() throws IOException {

        public JsonHolder createKey() {
            return keyHolder();

        // The value holder is pre-populated with a conversion target from the converter.
        public JsonHolder createValue() {
            JsonHolder holder = valueHolder();
            holder.value = converter.createTarget();
            return holder;

        public long getPos() throws IOException {
            return baseReader.getPos();

        public float getProgress() throws IOException {
            return baseReader.getProgress();

        public boolean next(JsonHolder key, JsonHolder value) throws IOException {
            boolean hasMore = baseReader.next(baseKey, baseValue);
            if (!hasMore)
                return false;
            // Only the value holder is updated; the key holder passed in is left untouched
            // in the visible code.
            value.value = converter.convert(baseKey, baseValue, value.value);
            return true;