List of usage examples for org.apache.hadoop.mapred.RecordReader.next
boolean next(K key, V value) throws IOException;
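Before the per-project examples, here is a minimal, self-contained sketch of the calling pattern they all share. It is an illustration only, assuming TextInputFormat over a line-oriented input path passed as args[0]; the class name NextLoopSketch is made up for this page and appears in none of the source files below.

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class NextLoopSketch {
    public static void main(String[] args) throws IOException {
        JobConf conf = new JobConf();
        FileInputFormat.setInputPaths(conf, new Path(args[0])); // assumed input path

        TextInputFormat format = new TextInputFormat();
        format.configure(conf);

        InputSplit[] splits = format.getSplits(conf, 1);
        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = format.getRecordReader(split, conf, Reporter.NULL);
            // createKey()/createValue() allocate reusable holders; next() overwrites
            // them on every call, so copy the contents if a record must outlive the loop.
            LongWritable key = reader.createKey();
            Text value = reader.createValue();
            try {
                while (reader.next(key, value)) { // false means the split is exhausted
                    System.out.println(key.get() + "\t" + value);
                }
            } finally {
                reader.close();
            }
        }
    }
}

The same shape recurs in every example that follows: createKey()/createValue() once per reader, next() in a loop until it returns false, close() when done. Because next() reuses the key and value objects, clone them (for example with org.apache.hadoop.io.WritableUtils.clone) if you need to retain records across iterations.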
From source file:TestFormatStorageRecordReader.java
License:Open Source License
public static void main(String[] argv) throws IOException {
    try {
        String path1 = "se_test/fs/basic/f1/kt/";
        String path2 = "se_test/fs/basic/f2/";

        initFormatData();

        JobConf conf1 = new JobConf(TestFormatStorageRecordReader.class);
        JobConf conf2 = new JobConf(TestFormatStorageRecordReader.class);

        FormatStorageSerDe serDe1 = initSerDe(conf1);
        FormatStorageSerDe serDe2 = initSerDe(conf2);

        StandardStructObjectInspector oi1 = (StandardStructObjectInspector) serDe1.getObjectInspector();
        List<? extends StructField> fieldRefs1 = oi1.getAllStructFieldRefs();

        StandardStructObjectInspector oi2 = (StandardStructObjectInspector) serDe2.getObjectInspector();
        List<? extends StructField> fieldRefs2 = oi2.getAllStructFieldRefs();

        InputFormat inputFormat = new FormatStorageInputFormat();

        RecordReader<WritableComparable, Writable> currRecReader1 = getRecReader(conf1, path1);
        WritableComparable key = currRecReader1.createKey();
        Writable value = currRecReader1.createValue();

        System.out.println("currRecReader1. output....");
        // Iterate until next() returns false, i.e. the split is exhausted.
        while (currRecReader1.next(key, value)) {
            ((Record) value).show();
            System.out.println("end value.show");

            // Round-trip the record through the SerDe.
            Object row = serDe1.deserialize((Record) value);
            Record record = (Record) serDe1.serialize(row, oi1);
            record.show();
        }
        /*
        RecordReader<WritableComparable, Writable> currRecReader2 = getRecReader(conf2, path2);
        key = currRecReader2.createKey();
        value = currRecReader2.createValue();
        System.out.println("currRecReader2. output....");
        while (currRecReader2.next(key, value)) {
            ((Record) value).show();
        }

        RecordReader<WritableComparable, Writable> currRecReader3 = getRecReader(conf1, path1);
        key = currRecReader3.createKey();
        value = currRecReader3.createValue();
        System.out.println("currRecReader3. output....");
        while (currRecReader3.next(key, value)) {
            ((Record) value).show();
        }
        */
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("get exception:" + e.getMessage());
    }
}
From source file:RunText.java
License:Apache License
@Override
public void run() {
    try {
        JobConf job = new JobConf();
        job.setInputFormat(format.getClass());

        RecordReader<LongWritable, Text> reader = format.getRecordReader(split, job, Reporter.NULL);
        Text value = reader.createValue();
        LongWritable key = reader.createKey();

        int count = 0;
        long t1 = System.nanoTime();
        while (reader.next(key, value)) {
            List<String> values = parse(value);
            if (values.get(index).equals(toFind)) {
                System.out.println(value);
            }
            count++;
            // Publish progress to the shared counter in batches of 100.
            if (count == 100) {
                totalCount.addAndGet(100);
                count = 0;
            }
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        runningThreads.decrementAndGet();
    }
}
From source file:Text2FormatStorageMR.java
License:Open Source License
@SuppressWarnings("unchecked") public static int readFormatFile(JobConf conf, String inputPath, int lineNum) throws Exception { RecordReader<WritableComparable, Writable> currRecReader; conf.set("mapred.input.dir", inputPath); InputFormat inputFormat = new FormatStorageInputFormat(); InputSplit[] inputSplits = inputFormat.getSplits(conf, 1); if (inputSplits.length == 0) { System.out.println("inputSplits is empty"); return -1; }/*from w w w . ja v a 2s . c o m*/ currRecReader = inputFormat.getRecordReader(inputSplits[0], conf, Reporter.NULL); WritableComparable key; Writable value; key = currRecReader.createKey(); value = currRecReader.createValue(); int num = 0; while (true) { boolean ret = currRecReader.next(key, value); if (ret) { Text Line = (Text) key; System.out.println(Line.toString()); num++; if (num >= lineNum) break; } else break; } return 0; }
From source file:TestFormatStorageInputFormat.java
License:Open Source License
public static void main(String[] argv) throws IOException, SerDeException {
    try {
        if (argv.length != 2) {
            System.out.println("TestFormatStorageInputFormat <input> <output>");
            System.exit(-1);
        }

        JobConf conf = new JobConf(TestFormatStorageInputFormat.class);
        conf.setJobName("TestFormatStorageInputFormat");
        conf.setNumMapTasks(1);
        conf.setNumReduceTasks(1);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Unit.Record.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(FormatStorageOutputFormat.class);
        conf.set("mapred.output.compress", "false");
        conf.set("mapred.input.dir", argv[0]);

        Head head = new Head();
        initHead(head);
        head.toJobConf(conf);

        FormatStorageSerDe serDe = initSerDe(conf);
        StandardStructObjectInspector oi = (StandardStructObjectInspector) serDe.getObjectInspector();
        List<? extends StructField> fieldRefs = oi.getAllStructFieldRefs();

        FileInputFormat.setInputPaths(conf, argv[0]);
        Path outputPath = new Path(argv[1]);
        FileOutputFormat.setOutputPath(conf, outputPath);

        InputFormat inputFormat = new FormatStorageInputFormat();
        InputSplit[] inputSplits = inputFormat.getSplits(conf, 1);
        if (inputSplits.length == 0) {
            System.out.println("inputSplits is empty");
            return;
        }
        System.out.println("get Splits:" + inputSplits.length);

        int size = inputSplits.length;
        System.out.println("getSplits return size:" + size);
        for (int i = 0; i < size; i++) {
            FormatStorageSplit split = (FormatStorageSplit) inputSplits[i];
            System.out.printf("split:" + i + " offset:" + split.getStart() + " len:" + split.getLength()
                    + " path:" + conf.get(ConstVar.InputPath) + " beginLine:" + split.getBeginLine()
                    + " endLine:" + split.getEndLine() + "\n");
        }

        // Read every split and time how long deserialization takes.
        int totalDelay = 0;
        RecordReader<WritableComparable, Writable> currRecReader = null;
        for (int i = 0; i < inputSplits.length; i++) {
            currRecReader = inputFormat.getRecordReader(inputSplits[i], conf, Reporter.NULL);

            WritableComparable key = currRecReader.createKey();
            Writable value = currRecReader.createValue();

            long begin = System.currentTimeMillis();
            int count = 0;
            while (currRecReader.next(key, value)) {
                Record record = (Record) value;
                Object row = serDe.deserialize(record);
                count++;
            }
            long end = System.currentTimeMillis();

            long delay = (end - begin) / 1000;
            totalDelay += delay;
            System.out.println(count + " record read over, delay " + delay + " s");
        }
        System.out.println("total delay:" + totalDelay);
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("get exception:" + e.getMessage());
    }
}
From source file:TestTextInputFormat.java
License:Open Source License
public static void main(String[] argv) throws IOException, SerDeException {
    try {
        if (argv.length != 2) {
            System.out.println("TestTextInputFormat <input> <output>");
            System.exit(-1);
        }

        JobConf conf = new JobConf(TestTextInputFormat.class);
        conf.setJobName("TestTextInputFormat");
        conf.setNumMapTasks(1);
        conf.setNumReduceTasks(1);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Unit.Record.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(FormatStorageOutputFormat.class);
        conf.set("mapred.output.compress", "false");
        conf.set("mapred.input.dir", argv[0]);

        LazySimpleSerDe serDe = initSerDe(conf);
        LazySimpleStructObjectInspector oi = (LazySimpleStructObjectInspector) serDe.getObjectInspector();
        List<? extends StructField> fieldRefs = oi.getAllStructFieldRefs();

        FileInputFormat.setInputPaths(conf, argv[0]);
        Path outputPath = new Path(argv[1]);
        FileOutputFormat.setOutputPath(conf, outputPath);

        InputFormat inputFormat = new TextInputFormat();
        ((TextInputFormat) inputFormat).configure(conf);

        InputSplit[] inputSplits = inputFormat.getSplits(conf, 1);
        if (inputSplits.length == 0) {
            System.out.println("inputSplits is empty");
            return;
        }
        System.out.println("get Splits:" + inputSplits.length);

        // Read every split and time how long deserialization takes.
        int totalDelay = 0;
        RecordReader<WritableComparable, Writable> currRecReader = null;
        for (int i = 0; i < inputSplits.length; i++) {
            currRecReader = inputFormat.getRecordReader(inputSplits[i], conf, Reporter.NULL);

            WritableComparable key = currRecReader.createKey();
            Writable value = currRecReader.createValue();

            long begin = System.currentTimeMillis();
            int count = 0;
            while (currRecReader.next(key, value)) {
                Object row = serDe.deserialize((Text) value);
                oi.getStructFieldsDataAsList(row);
                count++;
            }
            long end = System.currentTimeMillis();

            long delay = (end - begin) / 1000;
            totalDelay += delay;
            System.out.println(count + " record read over, delay " + delay + " s");
        }
        System.out.println("total delay:" + totalDelay);
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("get exception:" + e.getMessage());
    }
}
From source file:TestColumnStorageInputFormat.java
License:Open Source License
public static void main(String[] argv) throws IOException, SerDeException {
    try {
        if (argv.length != 2) {
            System.out.println("TestColumnStorageInputFormat <input> idx");
            System.exit(-1);
        }

        JobConf conf = new JobConf(TestColumnStorageInputFormat.class);
        conf.setJobName("TestColumnStorageInputFormat");
        conf.setNumMapTasks(1);
        conf.setNumReduceTasks(1);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Unit.Record.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.set("mapred.output.compress", "false");
        conf.set("mapred.input.dir", argv[0]);
        conf.set("hive.io.file.readcolumn.ids", argv[1]);

        FormatStorageSerDe serDe = initSerDe(conf);
        StandardStructObjectInspector oi = (StandardStructObjectInspector) serDe.getObjectInspector();
        List<? extends StructField> fieldRefs = oi.getAllStructFieldRefs();

        FileInputFormat.setInputPaths(conf, argv[0]);
        Path outputPath = new Path(argv[1]);
        FileOutputFormat.setOutputPath(conf, outputPath);

        InputFormat inputFormat = new ColumnStorageInputFormat();

        long begin = System.currentTimeMillis();
        InputSplit[] inputSplits = inputFormat.getSplits(conf, 1);
        long end = System.currentTimeMillis();
        System.out.println("getsplit delay " + (end - begin) + " ms");

        if (inputSplits.length == 0) {
            System.out.println("inputSplits is empty");
            return;
        }
        System.out.println("get Splits:" + inputSplits.length);

        int size = inputSplits.length;
        System.out.println("getSplits return size:" + size);
        for (int i = 0; i < size; i++) {
            ColumnStorageSplit split = (ColumnStorageSplit) inputSplits[i];
            System.out.printf("split:" + i + " offset:" + split.getStart() + " len:" + split.getLength()
                    + " path:" + split.getPath().toString() + " beginLine:" + split.getBeginLine()
                    + " endLine:" + split.getEndLine());
            if (split.getFileName() != null) {
                System.out.println("fileName:" + split.getFileName());
            } else {
                System.out.println("fileName null");
            }
            if (split.fileList() != null) {
                System.out.println("fileList.num:" + split.fileList().size());
                for (int j = 0; j < split.fileList().size(); j++) {
                    System.out.println("filelist " + j + ":" + split.fileList().get(j));
                }
            }
        }

        // Re-read all splits forever so read throughput can be observed over time.
        while (true) {
            int totalDelay = 0;
            RecordReader<WritableComparable, Writable> currRecReader = null;
            for (int i = 0; i < inputSplits.length; i++) {
                currRecReader = inputFormat.getRecordReader(inputSplits[i], conf, Reporter.NULL);

                WritableComparable key = currRecReader.createKey();
                Writable value = currRecReader.createValue();

                begin = System.currentTimeMillis();
                int count = 0;
                while (currRecReader.next(key, value)) {
                    Record record = (Record) value;
                    Object row = serDe.deserialize(record);
                    count++;
                }
                end = System.currentTimeMillis();

                long delay = (end - begin) / 1000;
                totalDelay += delay;
                System.out.println(count + " record read over, delay " + delay + " s");
            }
            System.out.println("total delay:" + totalDelay + "\n");
        }
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("get exception:" + e.getMessage());
    }
}
From source file:cascading.avro.AvroScheme.java
License:Apache License
/**
 * Source method to take an incoming Avro record and make it a Tuple.
 *
 * @param flowProcess The cascading FlowProcess object. Should be passed in by cascading automatically.
 * @param sourceCall  The cascading SourceCall object. Should be passed in by cascading automatically.
 * @return boolean true on successful parsing and collection, false on failure.
 * @throws IOException
 */
@Override
public boolean source(FlowProcess<JobConf> flowProcess, SourceCall<Object[], RecordReader> sourceCall)
        throws IOException {
    @SuppressWarnings("unchecked")
    RecordReader<AvroWrapper<IndexedRecord>, Writable> input = sourceCall.getInput();
    AvroWrapper<IndexedRecord> wrapper = input.createKey();
    // next() returning false signals that the input is exhausted.
    if (!input.next(wrapper, input.createValue())) {
        return false;
    }
    IndexedRecord record = wrapper.datum();
    Tuple tuple = sourceCall.getIncomingEntry().getTuple();
    tuple.clear();
    Object[] split = AvroToCascading.parseRecord(record, schema);
    tuple.addAll(split);
    return true;
}
From source file:cascading.scheme.DeprecatedAvroScheme.java
License:Apache License
/**
 * Source method to take an incoming Avro record and make it a Tuple.
 *
 * @param flowProcess The cascading FlowProcess object. Should be passed in by cascading automatically.
 * @param sourceCall  The cascading SourceCall object. Should be passed in by cascading automatically.
 * @return boolean true on successful parsing and collection, false on failure.
 * @throws IOException
 */
@Override
public boolean source(FlowProcess<? extends Configuration> flowProcess,
        SourceCall<Object[], RecordReader> sourceCall) throws IOException {
    @SuppressWarnings("unchecked")
    RecordReader<AvroWrapper<IndexedRecord>, Writable> input = sourceCall.getInput();
    AvroWrapper<IndexedRecord> wrapper = input.createKey();
    // next() returning false signals that the input is exhausted.
    if (!input.next(wrapper, input.createValue())) {
        return false;
    }
    IndexedRecord record = wrapper.datum();
    Tuple tuple = sourceCall.getIncomingEntry().getTuple();
    tuple.clear();
    Object[] split = AvroToCascading.parseRecord(record, schema);
    tuple.addAll(split);
    return true;
}
From source file:cascading.tap.hadoop.ZipInputFormatTest.java
License:Open Source License
public void testSplits() throws Exception {
    JobConf job = new JobConf();
    FileSystem currentFs = FileSystem.get(job);

    Path file = new Path(workDir, "test.zip");
    Reporter reporter = Reporter.NULL;

    int seed = new Random().nextInt();
    LOG.info("seed = " + seed);
    Random random = new Random(seed);

    FileInputFormat.setInputPaths(job, file);

    for (int entries = 1; entries < MAX_ENTRIES; entries += random.nextInt(MAX_ENTRIES / 10) + 1) {
        ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
        ZipOutputStream zos = new ZipOutputStream(byteArrayOutputStream);
        long length = 0;

        LOG.debug("creating zip file with entries = " + entries);

        // For each entry in the zip file, split MAX_LENGTH evenly between the entries.
        for (int entryCounter = 0; entryCounter < entries; entryCounter++) {
            long entryLength = MAX_LENGTH / entries;
            ZipEntry zipEntry = new ZipEntry("/entry" + entryCounter + ".txt");
            zipEntry.setMethod(ZipEntry.DEFLATED);
            zos.putNextEntry(zipEntry);

            for (length = entryCounter * entryLength; length < (entryCounter + 1) * entryLength; length++) {
                zos.write(Long.toString(length).getBytes());
                zos.write("\n".getBytes());
            }

            zos.flush();
            zos.closeEntry();
        }

        zos.flush();
        zos.close();

        currentFs.delete(file, true);
        OutputStream outputStream = currentFs.create(file);
        byteArrayOutputStream.writeTo(outputStream);
        outputStream.close();

        ZipInputFormat format = new ZipInputFormat();
        format.configure(job);

        LongWritable key = new LongWritable();
        Text value = new Text();

        InputSplit[] splits = format.getSplits(job, 100);

        // Every written value must be seen exactly once across all splits.
        BitSet bits = new BitSet((int) length);
        for (int j = 0; j < splits.length; j++) {
            LOG.debug("split[" + j + "]= " + splits[j]);
            RecordReader<LongWritable, Text> reader = format.getRecordReader(splits[j], job, reporter);
            try {
                int count = 0;
                while (reader.next(key, value)) {
                    int v = Integer.parseInt(value.toString());
                    LOG.debug("read " + v);
                    if (bits.get(v))
                        LOG.warn("conflict with " + v + " in split " + j + " at position " + reader.getPos());
                    assertFalse("key in multiple partitions.", bits.get(v));
                    bits.set(v);
                    count++;
                }
                LOG.debug("splits[" + j + "]=" + splits[j] + " count=" + count);
            } finally {
                reader.close();
            }
        }

        assertEquals("some keys in no partition.", length, bits.cardinality());
    }
}
From source file:com.benchmark.mapred.terasort.TeraInputFormat.java
License:Apache License
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 *
 * @param conf     the job to sample
 * @param partFile where to write the output file to
 * @throws IOException if something goes wrong
 */
public static void writePartitionFile(JobConf conf, Path partFile) throws IOException {
    TeraInputFormat inFormat = new TeraInputFormat();
    TextSampler sampler = new TextSampler();
    Text key = new Text();
    Text value = new Text();

    int partitions = conf.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
    InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
    int samples = Math.min(10, splits.length);
    long recordsPerSample = sampleSize / samples;
    int sampleStep = splits.length / samples;
    long records = 0;

    // Take N samples from different parts of the input.
    for (int i = 0; i < samples; ++i) {
        RecordReader<Text, Text> reader = inFormat.getRecordReader(splits[sampleStep * i], conf, null);
        while (reader.next(key, value)) {
            sampler.addKey(key);
            records += 1;
            if ((i + 1) * recordsPerSample <= records) {
                break;
            }
        }
    }

    FileSystem outFs = partFile.getFileSystem(conf);
    if (outFs.exists(partFile)) {
        outFs.delete(partFile, false);
    }

    // Write the chosen cut keys to the partition file, one per partition boundary.
    SequenceFile.Writer writer = SequenceFile.createWriter(outFs, conf, partFile, Text.class,
            NullWritable.class);
    NullWritable nullValue = NullWritable.get();
    for (Text split : sampler.createPartitions(partitions)) {
        writer.append(split, nullValue);
    }
    writer.close();
}