Example usage for org.apache.hadoop.mapred SequenceFileInputFormat SequenceFileInputFormat

Introduction

On this page you can find example usage for the org.apache.hadoop.mapred.SequenceFileInputFormat constructor, SequenceFileInputFormat().

Prototype

public SequenceFileInputFormat() 
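
For orientation, here is a minimal, self-contained sketch of how this no-argument constructor is typically used with the old org.apache.hadoop.mapred API: instantiate the format, point the JobConf at an input directory, and obtain splits and a record reader from it. The input path "/tmp/seq-input" and the LongWritable/Text key/value types are illustrative assumptions, not taken from the examples below.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;

public class SequenceFileInputFormatSketch {
    public static void main(String[] args) throws Exception {
        JobConf jobConf = new JobConf();
        // Placeholder input directory containing sequence files.
        FileInputFormat.setInputPaths(jobConf, new Path("/tmp/seq-input"));

        // The no-argument constructor shown in the prototype above.
        SequenceFileInputFormat<LongWritable, Text> format = new SequenceFileInputFormat<LongWritable, Text>();

        // Compute splits and read the records of the first split,
        // assuming the files store LongWritable keys and Text values.
        InputSplit[] splits = format.getSplits(jobConf, 1);
        RecordReader<LongWritable, Text> reader = format.getRecordReader(splits[0], jobConf, Reporter.NULL);
        LongWritable key = reader.createKey();
        Text value = reader.createValue();
        while (reader.next(key, value)) {
            System.out.println(key + "\t" + value);
        }
        reader.close();
    }
}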

Usage

From source file:eu.stratosphere.addons.parquet.SequenceFileWordCount.java

License:Apache License

@Override
public Plan getPlan(String... args) {

    int numSubTasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1);
    String dataInput = (args.length > 1 ? args[1] : "");
    String output = (args.length > 2 ? args[2] : "");

    HadoopDataSource source = new HadoopDataSource(new SequenceFileInputFormat(), new JobConf(), "Input Lines");
    SequenceFileInputFormat.addInputPath(source.getJobConf(), new Path(dataInput));

    MapOperator mapper = MapOperator.builder(new TokenizeLine()).input(source).name("Tokenize Lines").build();
    ReduceOperator reducer = ReduceOperator.builder(CountWords.class, StringValue.class, 0).input(mapper)
            .name("Count Words").build();
    FileDataSink out = new FileDataSink(new CsvOutputFormat(), output, reducer, "Word Counts");
    CsvOutputFormat.configureRecordFormat(out).recordDelimiter('\n').fieldDelimiter(' ')
            .field(StringValue.class, 0).field(IntValue.class, 1);

    Plan plan = new Plan(out, "WordCount Example with a Sequence File as Input");
    plan.setDefaultParallelism(numSubTasks);
    return plan;
}

From source file:mlbench.kmeans.KmeansInit.java

License:Apache License

/**
 * Read the input values and choose the initial centers for the K clusters.
 *
 * @param dataPath path of the input data
 * @throws MPI_D_Exception
 * @throws IOException
 * @throws MPIException
 */
@SuppressWarnings("deprecation")
private static void init(String args[], String dataPath, int kCluster, HashMap<String, String> conf)
        throws MPI_D_Exception, IOException, MPIException {
    MPI_D.Init(args, MPI_D.Mode.Common, conf);
    if (MPI_D.COMM_BIPARTITE_O != null) {
        rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O);
        size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O);
        FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O, config,
                dataPath, rank);

        // read each input split and send every point to a randomly chosen cluster
        for (FileSplit path : inputs) {
            SequenceFileInputFormat<LongWritable, VectorWritable> f = new SequenceFileInputFormat<LongWritable, VectorWritable>();
            JobConf jobConf = new JobConf(confPath);
            Reporter r = new KmeansUtils.EmptyReport();
            RecordReader<LongWritable, VectorWritable> reader = f.getRecordReader(path, jobConf, r);

            Random random = new Random(1000);
            LongWritable k = reader.createKey();
            VectorWritable v = reader.createValue();

            IntWritable cluster = new IntWritable();
            while (reader.next(k, v)) {
                cluster.set(random.nextInt(kCluster));
                MPI_D.Send(cluster, v);
            }
            reader.close();
        }
    } else {
        IntWritable key = null, newKey = null;
        VectorWritable point = null, newPoint = null;
        double sum[] = null;
        int count = 0;
        Object[] vals = MPI_D.Recv();
        while (vals != null) {
            newKey = (IntWritable) vals[0];
            newPoint = (VectorWritable) vals[1];
            if (key == null && point == null) {
                sum = new double[newPoint.get().size()];
            } else if (!key.equals(newKey)) {
                double[] centerVals = new double[sum.length];
                for (int i = 0; i < centerVals.length; i++) {
                    centerVals[i] = sum[i] / count;
                }
                PointVector oneCenter = new PointVector(Integer.valueOf(key.toString()), centerVals);
                centers.add(oneCenter);
                sum = new double[point.get().size()];
                count = 0;
            }
            key = newKey;
            point = newPoint;
            KmeansUtils.accumulate(sum, newPoint.get());
            count++;
            vals = MPI_D.Recv();
        }
        if (newKey != null && newPoint != null) {
            double[] centerVals = new double[sum.length];
            for (int i = 0; i < centerVals.length; i++) {
                centerVals[i] = sum[i] / count;
            }
            PointVector oneCenter = new PointVector(key.get(), centerVals);
            centers.add(oneCenter);
        }

        transfer = new KmeansUtils.CenterTransfer(config, rank, size);
        transfer.gatherCentersByP2P(centers);

        if (rank == 0) {
            OutputStream resOut = KmeansUtils.getOutputStream(outPath, config);
            DataOutput os = new DataOutputStream(resOut);

            for (PointVector centerPoint : centers) {
                os.write((centerPoint.toString() + "\n").getBytes());
            }
            resOut.flush();
            resOut.close();
        }

        System.out.println("rank " + rank + " finish");
    }
    MPI_D.Finalize();
}

From source file:mlbench.kmeans.KmeansIter.java

License:Apache License

/**
 * Calculate the new cluster centers for one iteration.
 *
 * @throws MPI_D_Exception
 * @throws MPIException
 * @throws IOException
 */
@SuppressWarnings("deprecation")
private static void iterBody(String args[], HashMap<String, String> conf)
        throws MPI_D_Exception, MPIException, IOException {
    MPI_D.Init(args, MPI_D.Mode.Common, conf);

    if (MPI_D.COMM_BIPARTITE_O != null) {
        rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O);
        size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O);

        if (rank == 0) {
            System.out.println(centerPath);
            DataInputStream in = KmeansUtils.readFromHDFSF(new Path(centerPath), config);

            String lineVal;
            try {
                while ((lineVal = in.readLine()) != null) {
                    String lineSeq[] = lineVal.split(":");
                    PointVector p = new PointVector(Integer.valueOf(lineSeq[0]), format(lineSeq[1]));
                    centers.add(p);
                }
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                try {
                    in.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        MPI_D.COMM_BIPARTITE_O.Barrier();

        KmeansUtils.CenterTransfer transfer = new KmeansUtils.CenterTransfer(config, rank, size);
        transfer.broadcastCenters(centers);

        FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O, config,
                dataPath, rank);
        double centerSum[][] = new double[kCluster][];
        long centerPNum[] = new long[kCluster];

        // read each input split and accumulate per-cluster sums and point counts
        for (FileSplit path : inputs) {
            SequenceFileInputFormat<LongWritable, VectorWritable> f = new SequenceFileInputFormat<LongWritable, VectorWritable>();
            JobConf jobConf = new JobConf(confPath);
            Reporter r = new KmeansUtils.EmptyReport();
            RecordReader<LongWritable, VectorWritable> reader = f.getRecordReader(path, jobConf, r);
            LongWritable k = reader.createKey();
            VectorWritable v = reader.createValue();

            while (reader.next(k, v)) {
                int centerBelong = (int) getBelongPoint(v);
                //                    int i = (int) p.getStrClusterClass();
                //                    double[] vals = p.getDoubleValue();
                int len = v.get().size();
                if (centerSum[centerBelong] == null) {
                    centerSum[centerBelong] = new double[len];
                }
                for (int j = 0; j < len; j++) {
                    centerSum[centerBelong][j] += v.get().get(j);
                }
                centerPNum[centerBelong]++;
            }
            reader.close();
        }

        for (int i = 0; i < centerPNum.length; i++) {
            if (centerSum[i] == null && centerPNum[i] == 0) {
                continue;
            }
            MPI_D.Send(new IntWritable(i), new KmeansCenters(centerPNum[i], centerSum[i]));
        }
    } else {
        centers.clear();
        IntWritable key = null, newKey = null;
        KmeansCenters value = null, newValue = null;
        double sum[] = null;
        long count = 0;
        Object[] vals = MPI_D.Recv();
        while (vals != null) {
            newKey = (IntWritable) vals[0];
            newValue = (KmeansCenters) vals[1];
            if (key == null && value == null) {
                sum = new double[newValue.getVector().length];
            } else if (!key.equals(newKey)) {
                double[] centerVals = new double[sum.length];
                for (int i = 0; i < centerVals.length; i++) {
                    centerVals[i] = (double) sum[i] / count;
                }
                PointVector oneCenter = new PointVector(Integer.valueOf(key.toString()), centerVals);
                centers.add(oneCenter);
                sum = new double[value.getVector().length];
                count = 0;
            }
            key = newKey;
            value = newValue;
            KmeansUtils.accumulate(sum, newValue.getVector());
            count += Long.valueOf(newValue.getPointSize());
            vals = MPI_D.Recv();
        }
        if (newKey != null && newValue != null) {
            double[] centerVals = new double[sum.length];
            for (int i = 0; i < centerVals.length; i++) {
                centerVals[i] = sum[i] / count;
            }
            PointVector oneCenter = new PointVector(key.get(), centerVals);
            centers.add(oneCenter);
        }

        KmeansUtils.CenterTransfer transfer = new KmeansUtils.CenterTransfer(config, rank, size);
        transfer.gatherCentersByP2P(centers);

        if (rank == 0) {
            OutputStream resOut = KmeansUtils.getOutputStream(outPath, config);
            DataOutput os = new DataOutputStream(resOut);

            for (PointVector centerPoint : centers) {
                os.write((centerPoint.toString() + "\n").getBytes());
            }
            resOut.flush();
            resOut.close();
        }
    }
    MPI_D.Finalize();
}

From source file:org.apache.hawq.pxf.plugins.hdfs.SequenceFileAccessor.java

License:Apache License

/**
 * Constructs a SequenceFileAccessor.
 *
 * @param input all input parameters coming from the client request
 */
public SequenceFileAccessor(InputData input) {
    super(input, new SequenceFileInputFormat<Writable, Writable>());
}

From source file:org.apache.tez.mapreduce.input.TestMultiMRInput.java

License:Apache License

@Test(timeout = 5000)
public void testSingleSplit() throws Exception {

    Path workDir = new Path(TEST_ROOT_DIR, "testSingleSplit");
    JobConf jobConf = new JobConf(defaultConf);
    jobConf.setInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(jobConf, workDir);

    MRInputUserPayloadProto.Builder builder = MRInputUserPayloadProto.newBuilder();
    builder.setGroupingEnabled(false);
    builder.setConfigurationBytes(TezUtils.createByteStringFromConf(jobConf));
    byte[] payload = builder.build().toByteArray();

    InputContext inputContext = createTezInputContext(payload);

    MultiMRInput input = new MultiMRInput(inputContext, 1);
    input.initialize();
    List<Event> eventList = new ArrayList<Event>();

    String file1 = "file1";
    LinkedHashMap<LongWritable, Text> data1 = createInputData(localFs, workDir, jobConf, file1, 0, 10);
    SequenceFileInputFormat<LongWritable, Text> format = new SequenceFileInputFormat<LongWritable, Text>();
    InputSplit[] splits = format.getSplits(jobConf, 1);
    assertEquals(1, splits.length);

    MRSplitProto splitProto = MRInputHelpers.createSplitProto(splits[0]);
    InputDataInformationEvent event = InputDataInformationEvent.createWithSerializedPayload(0,
            splitProto.toByteString().asReadOnlyByteBuffer());

    eventList.clear();
    eventList.add(event);
    input.handleEvents(eventList);

    int readerCount = 0;
    for (KeyValueReader reader : input.getKeyValueReaders()) {
        readerCount++;
        while (reader.next()) {
            if (data1.size() == 0) {
                fail("Found more records than expected");
            }
            Object key = reader.getCurrentKey();
            Object val = reader.getCurrentValue();
            assertEquals(val, data1.remove(key));
        }
    }
    assertEquals(1, readerCount);
}

From source file:org.apache.tez.mapreduce.input.TestMultiMRInput.java

License:Apache License

@Test(timeout = 5000)
public void testMultipleSplits() throws Exception {

    Path workDir = new Path(TEST_ROOT_DIR, "testMultipleSplits");
    JobConf jobConf = new JobConf(defaultConf);
    jobConf.setInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(jobConf, workDir);

    MRInputUserPayloadProto.Builder builder = MRInputUserPayloadProto.newBuilder();
    builder.setGroupingEnabled(false);
    builder.setConfigurationBytes(TezUtils.createByteStringFromConf(jobConf));
    byte[] payload = builder.build().toByteArray();

    InputContext inputContext = createTezInputContext(payload);

    MultiMRInput input = new MultiMRInput(inputContext, 2);
    input.initialize();
    List<Event> eventList = new ArrayList<Event>();

    LinkedHashMap<LongWritable, Text> data = new LinkedHashMap<LongWritable, Text>();

    String file1 = "file1";
    LinkedHashMap<LongWritable, Text> data1 = createInputData(localFs, workDir, jobConf, file1, 0, 10);

    String file2 = "file2";
    LinkedHashMap<LongWritable, Text> data2 = createInputData(localFs, workDir, jobConf, file2, 10, 20);

    data.putAll(data1);
    data.putAll(data2);

    SequenceFileInputFormat<LongWritable, Text> format = new SequenceFileInputFormat<LongWritable, Text>();
    InputSplit[] splits = format.getSplits(jobConf, 2);
    assertEquals(2, splits.length);

    MRSplitProto splitProto1 = MRInputHelpers.createSplitProto(splits[0]);
    InputDataInformationEvent event1 = InputDataInformationEvent.createWithSerializedPayload(0,
            splitProto1.toByteString().asReadOnlyByteBuffer());

    MRSplitProto splitProto2 = MRInputHelpers.createSplitProto(splits[1]);
    InputDataInformationEvent event2 = InputDataInformationEvent.createWithSerializedPayload(0,
            splitProto2.toByteString().asReadOnlyByteBuffer());

    eventList.clear();
    eventList.add(event1);
    eventList.add(event2);
    input.handleEvents(eventList);

    int readerCount = 0;
    for (KeyValueReader reader : input.getKeyValueReaders()) {
        readerCount++;
        while (reader.next()) {
            if (data.size() == 0) {
                fail("Found more records than expected");
            }
            Object key = reader.getCurrentKey();
            Object val = reader.getCurrentValue();
            assertEquals(val, data.remove(key));
        }
    }
    assertEquals(2, readerCount);
}

From source file:org.apache.tez.mapreduce.input.TestMultiMRInput.java

License:Apache License

@Test(timeout = 5000)
public void testExtraEvents() throws Exception {
    Path workDir = new Path(TEST_ROOT_DIR, "testExtraEvents");
    JobConf jobConf = new JobConf(defaultConf);
    jobConf.setInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(jobConf, workDir);

    MRInputUserPayloadProto.Builder builder = MRInputUserPayloadProto.newBuilder();
    builder.setGroupingEnabled(false);
    builder.setConfigurationBytes(TezUtils.createByteStringFromConf(jobConf));
    byte[] payload = builder.build().toByteArray();

    InputContext inputContext = createTezInputContext(payload);

    MultiMRInput input = new MultiMRInput(inputContext, 1);
    input.initialize();
    List<Event> eventList = new ArrayList<Event>();

    String file1 = "file1";
    createInputData(localFs, workDir, jobConf, file1, 0, 10);
    SequenceFileInputFormat<LongWritable, Text> format = new SequenceFileInputFormat<LongWritable, Text>();
    InputSplit[] splits = format.getSplits(jobConf, 1);
    assertEquals(1, splits.length);

    MRSplitProto splitProto = MRInputHelpers.createSplitProto(splits[0]);
    InputDataInformationEvent event1 = InputDataInformationEvent.createWithSerializedPayload(0,
            splitProto.toByteString().asReadOnlyByteBuffer());
    InputDataInformationEvent event2 = InputDataInformationEvent.createWithSerializedPayload(1,
            splitProto.toByteString().asReadOnlyByteBuffer());

    eventList.clear();
    eventList.add(event1);
    eventList.add(event2);
    try {
        input.handleEvents(eventList);
        fail("Expecting Exception due to too many events");
    } catch (Exception e) {
        assertTrue(e.getMessage().contains("Unexpected event. All physical sources already initialized"));
    }
}

From source file:org.apache.tez.mapreduce.processor.MapUtils.java

License:Apache License

private static InputSplit createInputSplit(FileSystem fs, Path workDir, JobConf job, Path file)
        throws IOException {
    FileInputFormat.setInputPaths(job, workDir);

    LOG.info("Generating data at path: " + file);
    // write a small sequence file of random <LongWritable, Text> entries
    @SuppressWarnings("deprecation")
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, file, LongWritable.class, Text.class);
    try {
        Random r = new Random(System.currentTimeMillis());
        LongWritable key = new LongWritable();
        Text value = new Text();
        for (int i = 10; i > 0; i--) {
            key.set(r.nextInt(1000));
            value.set(Integer.toString(i));
            writer.append(key, value);
            LOG.info("<k, v> : <" + key.get() + ", " + value + ">");
        }
    } finally {
        writer.close();
    }

    SequenceFileInputFormat<LongWritable, Text> format = new SequenceFileInputFormat<LongWritable, Text>();
    InputSplit[] splits = format.getSplits(job, 1);
    System.err.println("#split = " + splits.length + " ; " + "#locs = " + splits[0].getLocations().length + "; "
            + "loc = " + splits[0].getLocations()[0] + "; " + "off = " + splits[0].getLength() + "; "
            + "file = " + ((FileSplit) splits[0]).getPath());
    return splits[0];
}