Example usage for org.apache.hadoop.mapred RecordReader createKey

List of usage examples for org.apache.hadoop.mapred RecordReader createKey

Introduction

This page lists example usages of org.apache.hadoop.mapred RecordReader createKey.

Prototype

K createKey();

Document

Create an object of the appropriate type to be used as a key.
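
The usage examples below all follow the same idiom: obtain a RecordReader from an InputFormat, call createKey() and createValue() once to allocate reusable key and value objects, then loop on next(key, value) until it returns false. Here is a minimal, self-contained sketch of that idiom using TextInputFormat; the input path is a placeholder, so substitute a real text file.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class CreateKeyExample {

    public static void main(String[] args) throws IOException {
        JobConf conf = new JobConf(CreateKeyExample.class);

        // Placeholder input path; point this at any existing text file.
        FileInputFormat.setInputPaths(conf, new Path("/tmp/input.txt"));

        TextInputFormat format = new TextInputFormat();
        format.configure(conf);

        for (InputSplit split : format.getSplits(conf, 1)) {
            RecordReader<LongWritable, Text> reader = format.getRecordReader(split, conf, Reporter.NULL);

            // createKey()/createValue() allocate key/value objects of the types
            // this reader produces; next() refills the same objects on each call.
            LongWritable key = reader.createKey();
            Text value = reader.createValue();
            try {
                while (reader.next(key, value)) {
                    System.out.println(key.get() + "\t" + value.toString());
                }
            } finally {
                reader.close();
            }
        }
    }
}

The same pattern appears in every snippet below, only with different InputFormat implementations and key/value types.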

Usage

From source file:TestFormatStorageRecordReader.java

License:Open Source License

public static void main(String[] argv) throws IOException {
    try {
        String path1 = "se_test/fs/basic/f1/kt/";
        String path2 = "se_test/fs/basic/f2/";

        initFormatData();

        JobConf conf1 = new JobConf(TestFormatStorageRecordReader.class);
        JobConf conf2 = new JobConf(TestFormatStorageRecordReader.class);

        FormatStorageSerDe serDe1 = initSerDe(conf1);
        FormatStorageSerDe serDe2 = initSerDe(conf2);

        StandardStructObjectInspector oi1 = (StandardStructObjectInspector) serDe1.getObjectInspector();
        List<? extends StructField> fieldRefs1 = oi1.getAllStructFieldRefs();

        StandardStructObjectInspector oi2 = (StandardStructObjectInspector) serDe2.getObjectInspector();
        List<? extends StructField> fieldRefs2 = oi2.getAllStructFieldRefs();

        InputFormat inputFormat = new FormatStorageInputFormat();
        RecordReader<WritableComparable, Writable> currRecReader1 = getRecReader(conf1, path1);
        WritableComparable key;
        Writable value;

        key = currRecReader1.createKey();
        value = currRecReader1.createValue();
        System.out.println("currRecReader1. output....");
        while (currRecReader1.next(key, value)) {
            ((Record) value).show();
            System.out.println("end value.show");
            Object row = serDe1.deserialize((Record) value);
            Record record = (Record) serDe1.serialize(row, oi1);
            record.show();

        }
        /*
        RecordReader<WritableComparable, Writable> currRecReader2 = getRecReader(conf2, path2);                
        key = currRecReader2.createKey();
        value = currRecReader2.createValue();
        System.out.println("currRecReader2. output....");
        while (currRecReader2.next(key, value))
        {
        ((Record)value).show();
        }
                
        RecordReader<WritableComparable, Writable> currRecReader3 = getRecReader(conf1, path1);            
        key = currRecReader3.createKey();
        value = currRecReader3.createValue();
        System.out.println("currRecReader3. output....");
        while (currRecReader3.next(key, value))
        {
        ((Record)value).show();
        }
        */
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("get exception:" + e.getMessage());
    }
}

From source file:RunText.java

License:Apache License

@Override
public void run() {
    try {
        JobConf job = new JobConf();
        job.setInputFormat(format.getClass());
        RecordReader<LongWritable, Text> reader = format.getRecordReader(split, job, Reporter.NULL);
        Text value = reader.createValue();
        LongWritable key = reader.createKey();
        int count = 0;
        long t1 = System.nanoTime();
        while (reader.next(key, value)) {
            List<String> values = parse(value);
            if (values.get(index).equals(toFind)) {
                System.out.println(value);
            }
            count++;
            if (count == 100) {
                totalCount.addAndGet(100);
                count = 0;
            }
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        runningThreads.decrementAndGet();
    }
}

From source file:Text2FormatStorageMR.java

License:Open Source License

@SuppressWarnings("unchecked")
public static int readFormatFile(JobConf conf, String inputPath, int lineNum) throws Exception {

    RecordReader<WritableComparable, Writable> currRecReader;

    conf.set("mapred.input.dir", inputPath);

    InputFormat inputFormat = new FormatStorageInputFormat();
    InputSplit[] inputSplits = inputFormat.getSplits(conf, 1);
    if (inputSplits.length == 0) {
        System.out.println("inputSplits is empty");
        return -1;
    }

    currRecReader = inputFormat.getRecordReader(inputSplits[0], conf, Reporter.NULL);

    WritableComparable key;
    Writable value;

    key = currRecReader.createKey();
    value = currRecReader.createValue();

    int num = 0;

    while (true) {
        boolean ret = currRecReader.next(key, value);
        if (ret) {
            Text line = (Text) key;
            System.out.println(line.toString());
            num++;
            if (num >= lineNum)
                break;

        } else
            break;
    }

    return 0;
}

From source file:TestFormatStorageInputFormat.java

License:Open Source License

public static void main(String[] argv) throws IOException, SerDeException {
    try {
        if (argv.length != 2) {
            System.out.println("TestFormatStorageInputFormat <input> <output>");
            System.exit(-1);
        }

        JobConf conf = new JobConf(TestFormatStorageInputFormat.class);

        conf.setJobName("TestFormatStorageInputFormat");

        conf.setNumMapTasks(1);
        conf.setNumReduceTasks(1);

        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Unit.Record.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(FormatStorageOutputFormat.class);
        conf.set("mapred.output.compress", "flase");

        conf.set("mapred.input.dir", argv[0]);

        Head head = new Head();
        initHead(head);

        head.toJobConf(conf);

        FormatStorageSerDe serDe = initSerDe(conf);
        StandardStructObjectInspector oi = (StandardStructObjectInspector) serDe.getObjectInspector();
        List<? extends StructField> fieldRefs = oi.getAllStructFieldRefs();

        FileInputFormat.setInputPaths(conf, argv[0]);
        Path outputPath = new Path(argv[1]);
        FileOutputFormat.setOutputPath(conf, outputPath);

        InputFormat inputFormat = new FormatStorageInputFormat();
        InputSplit[] inputSplits = inputFormat.getSplits(conf, 1);
        if (inputSplits.length == 0) {
            System.out.println("inputSplits is empty");
            return;
        } else {
            System.out.println("get Splits:" + inputSplits.length);
        }

        int size = inputSplits.length;
        System.out.println("getSplits return size:" + size);
        for (int i = 0; i < size; i++) {
            FormatStorageSplit split = (FormatStorageSplit) inputSplits[i];
            System.out.printf("split:" + i + "offset:" + split.getStart() + "len:" + split.getLength() + "path:"
                    + conf.get(ConstVar.InputPath) + "beginLine:" + split.getBeginLine() + "endLine:"
                    + split.getEndLine() + "\n");
        }

        {
            int totalDelay = 0;
            RecordReader<WritableComparable, Writable> currRecReader = null;
            for (int i = 0; i < inputSplits.length; i++) {
                currRecReader = inputFormat.getRecordReader(inputSplits[i], conf, Reporter.NULL);

                WritableComparable key;
                Writable value;

                key = currRecReader.createKey();
                value = currRecReader.createValue();

                long begin = System.currentTimeMillis();
                int count = 0;
                while (currRecReader.next(key, value)) {
                    Record record = (Record) value;

                    Object row = serDe.deserialize(record);
                    count++;
                }
                long end = System.currentTimeMillis();

                long delay = (end - begin) / 1000;
                totalDelay += delay;
                System.out.println(count + " record read over, delay " + delay + " s");
            }

            System.out.println("total delay:" + totalDelay);
        }
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("get exception:" + e.getMessage());
    }
}

From source file:TestTextInputFormat.java

License:Open Source License

public static void main(String[] argv) throws IOException, SerDeException {
    try {
        if (argv.length != 2) {
            System.out.println("TestTextInputFormat <input> <output>");
            System.exit(-1);
        }

        JobConf conf = new JobConf(TestTextInputFormat.class);

        conf.setJobName("TestTextInputFormat");

        conf.setNumMapTasks(1);
        conf.setNumReduceTasks(1);

        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Unit.Record.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(FormatStorageOutputFormat.class);
        conf.set("mapred.output.compress", "flase");

        conf.set("mapred.input.dir", argv[0]);

        LazySimpleSerDe serDe = initSerDe(conf);
        LazySimpleStructObjectInspector oi = (LazySimpleStructObjectInspector) serDe.getObjectInspector();
        List<? extends StructField> fieldRefs = oi.getAllStructFieldRefs();

        FileInputFormat.setInputPaths(conf, argv[0]);
        Path outputPath = new Path(argv[1]);
        FileOutputFormat.setOutputPath(conf, outputPath);

        InputFormat inputFormat = new TextInputFormat();
        ((TextInputFormat) inputFormat).configure(conf);
        InputSplit[] inputSplits = inputFormat.getSplits(conf, 1);
        if (inputSplits.length == 0) {
            System.out.println("inputSplits is empty");
            return;
        } else {
            System.out.println("get Splits:" + inputSplits.length);
        }

        int totalDelay = 0;
        RecordReader<WritableComparable, Writable> currRecReader = null;
        for (int i = 0; i < inputSplits.length; i++) {
            currRecReader = inputFormat.getRecordReader(inputSplits[i], conf, Reporter.NULL);

            WritableComparable key;
            Writable value;

            key = currRecReader.createKey();
            value = currRecReader.createValue();

            long begin = System.currentTimeMillis();
            int count = 0;
            while (currRecReader.next(key, value)) {

                Object row = serDe.deserialize((Text) value);
                oi.getStructFieldsDataAsList(row);

                count++;
            }
            long end = System.currentTimeMillis();

            long delay = (end - begin) / 1000;
            totalDelay += delay;
            System.out.println(count + " record read over, delay " + delay + " s");
        }

        System.out.println("total delay:" + totalDelay);

        return;
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("get exception:" + e.getMessage());
    }
}

From source file:TestColumnStorageInputFormat.java

License:Open Source License

public static void main(String[] argv) throws IOException, SerDeException {
    try {
        if (argv.length != 2) {
            System.out.println("TestColumnStorageInputFormat <input> idx");
            System.exit(-1);
        }

        JobConf conf = new JobConf(TestColumnStorageInputFormat.class);

        conf.setJobName("TestColumnStorageInputFormat");

        conf.setNumMapTasks(1);
        conf.setNumReduceTasks(1);

        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Unit.Record.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.set("mapred.output.compress", "flase");

        conf.set("mapred.input.dir", argv[0]);

        conf.set("hive.io.file.readcolumn.ids", argv[1]);

        FormatStorageSerDe serDe = initSerDe(conf);
        StandardStructObjectInspector oi = (StandardStructObjectInspector) serDe.getObjectInspector();
        List<? extends StructField> fieldRefs = oi.getAllStructFieldRefs();

        FileInputFormat.setInputPaths(conf, argv[0]);
        Path outputPath = new Path(argv[1]);
        FileOutputFormat.setOutputPath(conf, outputPath);

        InputFormat inputFormat = new ColumnStorageInputFormat();
        long begin = System.currentTimeMillis();
        InputSplit[] inputSplits = inputFormat.getSplits(conf, 1);
        long end = System.currentTimeMillis();
        System.out.println("getsplit delay " + (end - begin) + " ms");

        if (inputSplits.length == 0) {
            System.out.println("inputSplits is empty");
            return;
        } else {
            System.out.println("get Splits:" + inputSplits.length);
        }

        int size = inputSplits.length;
        System.out.println("getSplits return size:" + size);
        for (int i = 0; i < size; i++) {
            ColumnStorageSplit split = (ColumnStorageSplit) inputSplits[i];
            System.out.printf("split:" + i + " offset:" + split.getStart() + "len:" + split.getLength()
                    + "path:" + split.getPath().toString() + "beginLine:" + split.getBeginLine() + "endLine:"
                    + split.getEndLine());
            if (split.getFileName() != null) {
                System.out.println("fileName:" + split.getFileName());
            } else {
                System.out.println("fileName null");
            }
            if (split.fileList() != null) {
                System.out.println("fileList.num:" + split.fileList().size());
                for (int j = 0; j < split.fileList().size(); j++) {
                    System.out.println("filelist " + j + ":" + split.fileList().get(j));
                }
            }
        }

        while (true) {
            int totalDelay = 0;
            RecordReader<WritableComparable, Writable> currRecReader = null;
            for (int i = 0; i < inputSplits.length; i++) {
                currRecReader = inputFormat.getRecordReader(inputSplits[i], conf, Reporter.NULL);

                WritableComparable key;
                Writable value;

                key = currRecReader.createKey();
                value = currRecReader.createValue();

                begin = System.currentTimeMillis();
                int count = 0;
                while (currRecReader.next(key, value)) {

                    Record record = (Record) value;

                    Object row = serDe.deserialize(record);
                    count++;

                }
                end = System.currentTimeMillis();

                long delay = (end - begin) / 1000;
                totalDelay += delay;
                System.out.println(count + " record read over, delay " + delay + " s");
            }

            System.out.println("total delay:" + totalDelay + "\n");
        }

    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("get exception:" + e.getMessage());
    }
}

From source file:cascading.avro.AvroScheme.java

License:Apache License

/**
 * Source method to take an incoming Avro record and make it a Tuple.
 *
 * @param flowProcess The cascading FlowProcess object. Should be passed in by cascading automatically.
 * @param sourceCall  The cascading SourceCall object. Should be passed in by cascading automatically.
 * @return boolean true on successful parsing and collection, false on failure.
 * @throws IOException
 */
@Override
public boolean source(FlowProcess<JobConf> flowProcess, SourceCall<Object[], RecordReader> sourceCall)
        throws IOException {

    @SuppressWarnings("unchecked")
    RecordReader<AvroWrapper<IndexedRecord>, Writable> input = sourceCall.getInput();
    AvroWrapper<IndexedRecord> wrapper = input.createKey();
    if (!input.next(wrapper, input.createValue())) {
        return false;
    }
    IndexedRecord record = wrapper.datum();
    Tuple tuple = sourceCall.getIncomingEntry().getTuple();
    tuple.clear();

    Object[] split = AvroToCascading.parseRecord(record, schema);
    tuple.addAll(split);

    return true;
}

From source file:cascading.scheme.DeprecatedAvroScheme.java

License:Apache License

/**
 * Source method to take an incoming Avro record and make it a Tuple.
 *
 * @param flowProcess The cascading FlowProcess object. Should be passed in by cascading automatically.
 * @param sourceCall  The cascading SourceCall object. Should be passed in by cascading automatically.
 * @return boolean true on successful parsing and collection, false on failure.
 * @throws IOException
 */
@Override
public boolean source(FlowProcess<? extends Configuration> flowProcess,
        SourceCall<Object[], RecordReader> sourceCall) throws IOException {

    @SuppressWarnings("unchecked")
    RecordReader<AvroWrapper<IndexedRecord>, Writable> input = sourceCall.getInput();
    AvroWrapper<IndexedRecord> wrapper = input.createKey();
    if (!input.next(wrapper, input.createValue())) {
        return false;
    }
    IndexedRecord record = wrapper.datum();
    Tuple tuple = sourceCall.getIncomingEntry().getTuple();
    tuple.clear();

    Object[] split = AvroToCascading.parseRecord(record, schema);
    tuple.addAll(split);

    return true;
}

From source file:com.ebay.erl.mobius.core.mapred.MobiusInputSampler.java

License:Apache License

@Override
public Object[] getSample(InputFormat inf, JobConf job) throws IOException {
    // the following codes are copied from {@link InputSampler#RandomSampler},
    // but require some modifications.

    InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
    ArrayList<DataJoinKey> samples = new ArrayList<DataJoinKey>(this.numSamples);
    int splitsToSample = Math.min(this.maxSplitsSampled, splits.length);

    Random r = new Random();
    long seed = r.nextLong();
    r.setSeed(seed);

    // get Sorters
    Sorter[] sorters = null;
    if (job.get(ConfigureConstants.SORTERS, null) != null) {
        // total sort job
        sorters = (Sorter[]) SerializableUtil.deserializeFromBase64(job.get(ConfigureConstants.SORTERS), job);
    } else {
        // there is no sorter, should be reducer/join job
        Column[] keys = (Column[]) SerializableUtil
                .deserializeFromBase64(job.get(ConfigureConstants.ALL_GROUP_KEY_COLUMNS), job);
        sorters = new Sorter[keys.length];
        for (int i = 0; i < keys.length; i++) {
            sorters[i] = new Sorter(keys[i].getInputColumnName(), Ordering.ASC);
        }
    }

    long proportion = 10L;
    while ((int) (this.freq * proportion) == 0) {
        proportion = proportion * 10;
    }
    proportion = 5L * proportion;

    // shuffle splits
    for (int i = 0; i < splits.length; ++i) {
        InputSplit tmp = splits[i];
        int j = r.nextInt(splits.length);
        splits[i] = splits[j];
        splits[j] = tmp;
    }

    SamplingOutputCollector collector = new SamplingOutputCollector();
    for (int i = 0; i < splitsToSample || (i < splits.length && samples.size() < numSamples); i++) {
        LOGGER.info("Sampling from split #" + (i + 1) + ", collected samples:" + samples.size());

        RecordReader<WritableComparable, WritableComparable> reader = inf.getRecordReader(splits[i], job,
                Reporter.NULL);
        WritableComparable key = reader.createKey();
        WritableComparable value = reader.createValue();

        if (!(inf instanceof MobiusDelegatingInputFormat)) {
            // not mobius delegating input format, so the CURRENT_DATASET_ID
            // will not be set by inf#getRecordReader, so we set it here.
            //
            // set the current dataset id, as the AbstractMobiusMapper#configure
            // method needs this property.
            job.set(ConfigureConstants.CURRENT_DATASET_ID, job.get(ConfigureConstants.ALL_DATASET_IDS));
        }

        Byte datasetID = Byte.valueOf(job.get(ConfigureConstants.CURRENT_DATASET_ID));
        LOGGER.info("Samples coming from dataset: " + datasetID.toString());
        AbstractMobiusMapper mapper = this.getMapper(inf, splits[i], job);
        mapper.configure(job);

        // reading elements from one split
        long readElement = 0;
        while (reader.next(key, value)) {
            collector.clear();
            Tuple tuple = mapper.parse(key, value);

            readElement++;
            if (readElement > (((long) numSamples) * ((long) proportion))) {
                // a split might be very big (ex: a large gz file),
                // so we only read up to numSamples * proportion records from it.
                break;
            }

            if (r.nextDouble() <= freq) {
                if (samples.size() < numSamples) {
                    mapper.joinmap(key, value, collector, Reporter.NULL);
                    // joinmap function might generate more than one output key
                    // per <code>key</code> input. 
                    for (Tuple t : collector.getOutKey()) {
                        Tuple mt = Tuple.merge(tuple, t);
                        DataJoinKey nkey = this.getKey(mt, sorters, datasetID, mapper, job);
                        samples.add(nkey);
                    }
                } else {
                    // When exceeding the maximum number of samples, replace
                    // a random element with this one, then adjust the
                    // frequency to reflect the possibility of existing 
                    // elements being pushed out

                    mapper.joinmap(key, value, collector, Reporter.NULL);
                    for (Tuple t : collector.getOutKey()) {
                        int ind = r.nextInt(numSamples);
                        if (ind != numSamples) {
                            Tuple mt = Tuple.merge(tuple, t);
                            DataJoinKey nkey = this.getKey(mt, sorters, datasetID, mapper, job);
                            samples.set(ind, nkey);
                        }
                    }

                    freq *= (numSamples - collector.getOutKey().size()) / (double) numSamples;
                }
                key = reader.createKey();
                value = reader.createValue();
            }
        }
        reader.close();
    }
    LOGGER.info("Samples have been collected, return.");
    return samples.toArray();
}

From source file:com.facebook.hive.orc.TestInputOutputFormat.java

License:Apache License

@Test
public void testInOutFormat() throws Exception {
    Properties properties = new Properties();
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
                ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    SerDe serde = new OrcSerde();
    HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
    FileSinkOperator.RecordWriter writer = outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true,
            properties, Reporter.NULL);
    ReaderWriterProfiler.setProfilerOptions(conf);
    writer.write(serde.serialize(new MyRow(1, 2), inspector));
    writer.write(serde.serialize(new MyRow(2, 2), inspector));
    writer.write(serde.serialize(new MyRow(3, 2), inspector));
    writer.close(true);
    serde = new OrcSerde();
    properties.setProperty("columns", "x,y");
    properties.setProperty("columns.types", "int:int");
    serde.initialize(conf, properties);
    assertEquals(OrcSerde.OrcSerdeRow.class, serde.getSerializedClass());
    inspector = (StructObjectInspector) serde.getObjectInspector();
    assertEquals("struct<x:int,y:int>", inspector.getTypeName());
    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(1, splits.length);

    // test the validateInput method
    ArrayList<FileStatus> fileList = new ArrayList<FileStatus>(3);
    assertEquals(false, ((InputFormatChecker) in).validateInput(fs, new HiveConf(), fileList));
    fileList.add(fs.getFileStatus(testFilePath));
    assertEquals(true, ((InputFormatChecker) in).validateInput(fs, new HiveConf(), fileList));
    fileList.add(fs.getFileStatus(workDir));
    assertEquals(false, ((InputFormatChecker) in).validateInput(fs, new HiveConf(), fileList));

    // read the whole file
    org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    Object key = reader.createKey();
    Writable value = (Writable) reader.createValue();
    int rowNum = 0;
    List<? extends StructField> fields = inspector.getAllStructFieldRefs();
    IntObjectInspector intInspector = (IntObjectInspector) fields.get(0).getFieldObjectInspector();
    assertEquals(0.0, reader.getProgress(), 0.00001);
    assertEquals(0, reader.getPos());
    while (reader.next(key, value)) {
        assertEquals(++rowNum,
                intInspector.get(inspector.getStructFieldData(serde.deserialize(value), fields.get(0))));
        assertEquals(2,
                intInspector.get(inspector.getStructFieldData(serde.deserialize(value), fields.get(1))));
    }
    assertEquals(3, rowNum);
    assertEquals(1.0, reader.getProgress(), 0.00001);
    reader.close();

    // read just the first column
    conf.set("hive.io.file.readcolumn.ids", "0");
    reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    key = reader.createKey();
    value = (Writable) reader.createValue();
    rowNum = 0;
    fields = inspector.getAllStructFieldRefs();
    while (reader.next(key, value)) {
        assertEquals(++rowNum, intInspector.get(inspector.getStructFieldData(value, fields.get(0))));
        assertEquals(null, inspector.getStructFieldData(value, fields.get(1)));
    }
    assertEquals(3, rowNum);
    reader.close();

    // test the mapping of empty string to all columns
    conf.set("hive.io.file.readcolumn.ids", "");
    reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    key = reader.createKey();
    value = (Writable) reader.createValue();
    rowNum = 0;
    fields = inspector.getAllStructFieldRefs();
    while (reader.next(key, value)) {
        assertEquals(++rowNum, intInspector.get(inspector.getStructFieldData(value, fields.get(0))));
        assertEquals(2,
                intInspector.get(inspector.getStructFieldData(serde.deserialize(value), fields.get(1))));
    }
    assertEquals(3, rowNum);
    reader.close();
}