public IntWritable() 

Source Link


From source file:RunPageRankSchimmy.java

License:Apache License

/**
 * Sets up phase 1 of one PageRankSchimmy iteration and returns the mass read back from the
 * "-mass" side output.
 *
 * <p>The input directory is {@code path + "/iter" + FORMAT.format(i)}, the output directory is
 * {@code path + "/iter" + FORMAT.format(j) + "t"}, and the mass directory is the output path
 * with {@code "-mass"} appended.
 *
 * <p>NOTE(review): this excerpt appears to have lost lines. Several loops and branches are never
 * closed, the combiner/range branches are empty, and no call that actually runs the job is
 * visible. Confirm against the full source before relying on these comments.
 *
 * @param path base directory that holds the per-iteration subdirectories
 * @param i iteration number used to build the input directory name
 * @param j iteration number used to build the output directory name and the job name
 * @param n node count; stored in the configuration under "NodeCount"
 * @param useCombiner logged, and tested in a branch whose body is not visible here
 * @param useInmapCombiner logged, and tested in a branch whose body is not visible here
 * @param useRange when true a RangePartitioner is used, otherwise a HashPartitioner
 * @return the values read from every file under the mass directory, folded together with
 *         sumLogProbs starting from Float.NEGATIVE_INFINITY
 * @throws Exception declared by the signature; the visible code performs file-system I/O
 */
private float phase1(String path, int i, int j, int n, boolean useCombiner, boolean useInmapCombiner,
        boolean useRange) throws Exception {
    Configuration conf = getConf();

    // Derive the three paths this phase works with (input, output, mass side output).
    String in = path + "/iter" + FORMAT.format(i);
    String out = path + "/iter" + FORMAT.format(j) + "t";
    String outm = out + "-mass";

    FileSystem fs = FileSystem.get(conf);

    // We need to actually count the number of part files to get the number
    // of partitions (because the directory might contain _log).
    int numPartitions = 0;
    for (FileStatus s : FileSystem.get(conf).listStatus(new Path(in))) {
        if (s.getPath().getName().contains("part-")) {
            numPartitions++;

    conf.setInt("NodeCount", n);

    Partitioner<IntWritable, Writable> p = null;

    // Pick the partitioner; the range partitioner is additionally handed the configuration.
    if (useRange) {
        p = new RangePartitioner();
        ((Configurable) p).setConf(conf);
    } else {
        p = new HashPartitioner<IntWritable, Writable>();

    // This is really annoying: the mapping between the partition numbers on
    // disk (i.e., part-XXXX) and what partition the file contains (i.e.,
    // key.hash % #reducer) is arbitrary... so this means that we need to
    // open up each partition, peek inside to find out.
    IntWritable key = new IntWritable();
    PageRankNode value = new PageRankNode();
    FileStatus[] status = fs.listStatus(new Path(in));

    // Accumulates "<partition>=<file path>;" entries, one per inspected file.
    StringBuilder sb = new StringBuilder();

    for (FileStatus f : status) {
        // Guard for entries that are not part files; its body is not visible in this excerpt.
        if (!f.getPath().getName().contains("part-")) {

        SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(f.getPath()));

        // Only the first record is read; its key is used to ask the partitioner which
        // partition this file corresponds to.
        reader.next(key, value);
        int np = p.getPartition(key, value, numPartitions);
        // NOTE(review): no reader.close() is visible in this excerpt - confirm it exists.

        LOG.info(f.getPath() + "\t" + np);
        sb.append(np + "=" + f.getPath() + ";");


    LOG.info("PageRankSchimmy: iteration " + j + ": Phase1");
    LOG.info(" - input: " + in);
    LOG.info(" - output: " + out);
    LOG.info(" - nodeCnt: " + n);
    LOG.info(" - useCombiner: " + useCombiner);
    LOG.info(" - useInmapCombiner: " + useInmapCombiner);
    LOG.info(" - numPartitions: " + numPartitions);
    LOG.info(" - useRange: " + useRange);
    LOG.info("computed number of partitions: " + numPartitions);

    // One reduce task per input partition. No use of this variable is visible in the excerpt.
    int numReduceTasks = numPartitions;

    // Minimum split size of 1024 * 1024 * 1024 bytes.
    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    //conf.set("mapred.child.java.opts", "-Xmx2048m");

    // Publish the paths and the partition mapping computed above through the configuration.
    conf.set("PageRankMassPath", outm);
    conf.set("BasePath", in);
    conf.set("PartitionMapping", sb.toString().trim());

    // Speculative execution is switched off for both map and reduce tasks.
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    Job job = Job.getInstance(conf);
    job.setJobName("PageRankSchimmy:iteration" + j + ":Phase1");


    FileInputFormat.setInputPaths(job, new Path(in));
    FileOutputFormat.setOutputPath(job, new Path(out));




    // NOTE(review): the bodies of the next three branches are not visible in this excerpt.
    if (useInmapCombiner) {
    } else {

    if (useCombiner) {

    if (useRange) {


    // Remove any existing output and mass directories (second argument: true).
    FileSystem.get(conf).delete(new Path(out), true);
    FileSystem.get(conf).delete(new Path(outm), true);

    // NOTE(review): no call that runs the job is visible between taking the start time and
    // printing the elapsed time - confirm against the full source.
    long startTime = System.currentTimeMillis();
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // Fold one float from each file in the mass directory into the running total.
    float mass = Float.NEGATIVE_INFINITY;
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());
        mass = sumLogProbs(mass, fin.readFloat());
        // NOTE(review): no fin.close() is visible in this excerpt - confirm it exists.

    return mass;

From source file:ac.keio.sslab.nlp.lda.RowIdJob.java

License:Apache License

/**
 * Reads (Text, VectorWritable) records from the job's input path and writes two sequence files
 * under the output path: "docIndex" (IntWritable id to the record's Text key) and "matrix"
 * (IntWritable id to the record's vector).
 *
 * <p>NOTE(review): this excerpt appears to have lost lines. The try-with-resources header is
 * incomplete, several blocks are never closed, and no statement that sets {@code docId} or
 * increments {@code i} is visible. Confirm against the full source.
 *
 * @param args command-line arguments, passed to parseArguments
 * @return -1 when parseArguments returns null, otherwise 0
 * @throws Exception declared by the signature; the visible code performs file-system I/O
 */
@Override
public int run(String[] args) throws Exception {


    Map<String, List<String>> parsedArgs = parseArguments(args);
    // Argument parsing failed: report failure to the caller.
    if (parsedArgs == null) {
        return -1;

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Both outputs are placed directly under the configured output path.
    Path outputPath = getOutputPath();
    Path indexPath = new Path(outputPath, "docIndex");
    Path matrixPath = new Path(outputPath, "matrix");

    // NOTE(review): the end of the first createWriter call is missing from this excerpt.
    try (SequenceFile.Writer indexWriter = SequenceFile.createWriter(fs, conf, indexPath, IntWritable.class,
            SequenceFile.Writer matrixWriter = SequenceFile.createWriter(fs, conf, matrixPath,
                    IntWritable.class, VectorWritable.class)) {
        IntWritable docId = new IntWritable();
        int i = 0;
        int numCols = 0;
        for (Pair<Text, VectorWritable> record : new SequenceFileDirIterable<Text, VectorWritable>(
                getInputPath(), PathType.LIST, PathFilters.logsCRCFilter(), null, true, conf)) {
            VectorWritable value = record.getSecond();
            // The same docId is the key in both files, which links an index entry to its row.
            indexWriter.append(docId, record.getFirst());
            matrixWriter.append(docId, value);
            // Column count is taken from the size of the most recently written vector.
            numCols = value.get().size();

        log.info("Wrote out matrix with {} rows and {} columns to {}", i, numCols, matrixPath);
        return 0;

From source file:Analysis.A4_High_Traffic_Countries.Top_10_Countries_by_User_Traffic_Reducer.java

/**
 * Emits every entry of the {@code top10} map as a (Text, IntWritable) pair: the entry's string
 * value with its first character upper-cased, and the entry's integer key.
 *
 * <p>NOTE(review): the closing braces of the loop and the method are not visible in this
 * excerpt. An empty string value would make {@code substring(0, 1)} throw - confirm that
 * values are never empty.
 *
 * @param context task context that receives the output records
 * @throws IOException declared by the signature
 * @throws InterruptedException declared by the signature
 */
protected void cleanup(Context context) throws IOException, InterruptedException {
    for (Map.Entry<Integer, String> entry : top10.entrySet()) {

        IntWritable result = new IntWritable();

        //Integer key = entry.getKey();
        // Capitalize the first character of the name; the rest is left unchanged.
        String value = entry.getValue().substring(0, 1).toUpperCase() + entry.getValue().substring(1);

        result.set(entry.getKey());

        // print top 10 countries
        context.write(new Text(value), result);

From source file:ar.edu.ungs.garules.CensusJob.java

License:Apache License

/**
 * Takes the reducer output from the distributed file system and loads it into the in-memory
 * "ocurrencias" map.
 * @param conf
 * @param path
 * @throws IOException
 */
// Reads every (Text, IntWritable) record from "<path>/part-r-00000" and stores it in the
// ocurrencias map as key string -> int value.
// NOTE(review): only that single part file is read, and no reader.close() is visible in this
// excerpt (the method's closing brace is also missing) - confirm against the full source.
private static void llenarOcurrencias(Configuration conf, String path) throws IOException {
    // The file system is built explicitly from the default host/port constants instead of
    // being looked up from the configuration.
    FileSystem fs = new DistributedFileSystem(
            new InetSocketAddress(DEFAULT_FILE_SYSTEM_HOST, DEFAULT_FILE_SYSTEM_PORT), conf);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(path + "/part-r-00000"), conf);

    // key and value are reused for every record, so their contents are copied out
    // (toString() / get()) before the next read overwrites them.
    Text key = new Text();
    IntWritable value = new IntWritable();
    while (reader.next(key, value))
        ocurrencias.put(key.toString(), value.get());

From source file:at.illecker.hama.hybrid.examples.hellohybrid.HelloHybridBSP.java

License:Apache License

/**
 * CPU-side BSP body of the HelloHybrid example. It writes a per-task "<taskId>.log" file into
 * the job's output directory and records a series of small checks there: reading the peer's
 * input, reading a sequence file, writing output, listing peer names, and String.split.
 *
 * <p>NOTE(review): this excerpt appears to have lost lines. Several loops and blocks are never
 * closed, three split checks have lost the line that starts their call, and no statement that
 * adds to {@code summation} or closes the reader/stream is visible. Confirm against the full
 * source.
 *
 * @param peer the BSP peer supplying configuration, input records, task id and peer names
 * @throws IOException declared by the signature
 * @throws SyncException declared by the signature
 * @throws InterruptedException declared by the signature
 */
public void bsp(BSPPeer<IntWritable, NullWritable, IntWritable, NullWritable, NullWritable> peer)
        throws IOException, SyncException, InterruptedException {

    BSPJob job = new BSPJob((HamaConfiguration) peer.getConfiguration());
    FileSystem fs = FileSystem.get(peer.getConfiguration());
    // Per-task log file: <job output path>/<task id>.log
    FSDataOutputStream outStream = fs
            .create(new Path(FileOutputFormat.getOutputPath(job), peer.getTaskId() + ".log"));

    outStream.writeChars("HelloHybrid.bsp executed on CPU!\n");

    ArrayList<Integer> summation = new ArrayList<Integer>();

    // test input
    IntWritable key = new IntWritable();
    NullWritable nullValue = NullWritable.get();

    // Log every key delivered as this peer's input.
    while (peer.readNext(key, nullValue)) {
        outStream.writeChars("input: key: '" + key.get() + "'\n");

    // test sequenceFileReader
    Path example = new Path(peer.getConfiguration().get(CONF_EXAMPLE_PATH));
    SequenceFile.Reader reader = null;
    try {
        reader = new SequenceFile.Reader(fs, example, peer.getConfiguration());

        int i = 0;
        while (reader.next(key, nullValue)) {
            outStream.writeChars("sequenceFileReader: key: '" + key.get() + "'\n");
            // Add the example key to the i-th running sum, when that slot exists.
            if (i < summation.size()) {
                summation.set(i, summation.get(i) + key.get());
    } catch (IOException e) {
        // Read failures are rethrown unchecked with the original exception as the cause.
        throw new RuntimeException(e);
    } finally {
        if (reader != null) {

    // test output
    // NOTE(review): the loop variable i is not used in the visible lines; key is what gets
    // logged and written - confirm against the full source.
    for (Integer i : summation) {
        outStream.writeChars("output: key: '" + key.get() + "'\n");
        peer.write(key, nullValue);

    // test getAllPeerNames
    outStream.writeChars("getAllPeerNames: '" + Arrays.toString(peer.getAllPeerNames()) + "'\n");

    // test String.split
    String splitString = "boo:and:foo";
    String[] splits;

    outStream.writeChars("splitString: '" + splitString + "'\n");

    splits = splitString.split(":");
    outStream.writeChars("split(\":\") len: " + splits.length + " values: '" + Arrays.toString(splits) + "'\n");

    // NOTE(review): the next three checks have lost the line that begins their call.
    splits = splitString.split(":", 2);
            "split(\":\",2) len: " + splits.length + " values: '" + Arrays.toString(splits) + "'\n");

    splits = splitString.split(":", 5);
            "split(\":\",5) len: " + splits.length + " values: '" + Arrays.toString(splits) + "'\n");

    splits = splitString.split(":", -2);
            "split(\":\",-2) len: " + splits.length + " values: '" + Arrays.toString(splits) + "'\n");

    // The separator ";" does not occur in splitString.
    splits = splitString.split(";");
    outStream.writeChars("split(\";\") len: " + splits.length + " values: '" + Arrays.toString(splits) + "'\n");


From source file:at.illecker.hama.hybrid.examples.hellohybrid.HelloHybridBSP.java

License:Apache License

/**
 * Creates one uncompressed (IntWritable, NullWritable) sequence file per BSP task under
 * {@code inputPath}, named "input<i>.seq", plus a single example file, and appends records to
 * them.
 *
 * <p>NOTE(review): this excerpt appears to have lost lines. The loops are never closed, no
 * statement that uses {@code r} or sets {@code inputKey} is visible, and the body of the final
 * "close" loop is missing. Confirm against the full source.
 *
 * @param conf configuration; "bsp.peers.num" (default 1) gives the number of input files
 * @param inputPath directory that receives the per-task input files
 * @param exampleFile path of the additional example sequence file
 * @param n upper bound of the outer write loop
 * @throws IOException declared by the signature; the visible code creates and appends to files
 */
private static void prepareInput(Configuration conf, Path inputPath, Path exampleFile, int n)
        throws IOException {
    FileSystem fs = inputPath.getFileSystem(conf);

    // Create input file writers depending on bspTaskNum
    int bspTaskNum = conf.getInt("bsp.peers.num", 1);
    SequenceFile.Writer[] inputWriters = new SequenceFile.Writer[bspTaskNum];
    for (int i = 0; i < bspTaskNum; i++) {
        Path inputFile = new Path(inputPath, "input" + i + ".seq");
        LOG.info("inputFile: " + inputFile.toString());
        inputWriters[i] = SequenceFile.createWriter(fs, conf, inputFile, IntWritable.class, NullWritable.class,
                CompressionType.NONE);

    // Create example file writer
    SequenceFile.Writer exampleWriter = SequenceFile.createWriter(fs, conf, exampleFile, IntWritable.class,
            NullWritable.class, CompressionType.NONE);

    // Write random values to input files and example
    IntWritable inputKey = new IntWritable();
    NullWritable nullValue = NullWritable.get();
    Random r = new Random();
    for (long i = 0; i < n; i++) {
        // The current key is appended to every per-task input file and to the example file.
        for (int j = 0; j < inputWriters.length; j++) {
            inputWriters[j].append(inputKey, nullValue);
        exampleWriter.append(inputKey, nullValue);

    // Close file writers
    for (int j = 0; j < inputWriters.length; j++) {

From source file:at.illecker.hama.hybrid.examples.hellohybrid.HelloHybridBSP.java

License:Apache License

/**
 * Prints the contents of every non-empty file directly under {@code path} to standard output.
 * Each file is first read as an (IntWritable, NullWritable) sequence file; if that raises an
 * IOException, the file's raw bytes are copied to System.out instead.
 *
 * <p>NOTE(review): the closing braces after the {@code finally} guard are not visible in this
 * excerpt, so the reader clean-up cannot be confirmed here.
 *
 * @param job job whose configuration is used to resolve the file system and read the files
 * @param path directory whose entries are listed and printed
 * @throws IOException declared by the signature; listing or the fallback copy may fail
 */
static void printOutput(BSPJob job, Path path) throws IOException {
    FileSystem fs = path.getFileSystem(job.getConfiguration());
    FileStatus[] files = fs.listStatus(path);
    for (int i = 0; i < files.length; i++) {
        // Zero-length entries are skipped.
        if (files[i].getLen() > 0) {
            System.out.println("File " + files[i].getPath());
            SequenceFile.Reader reader = null;
            try {
                reader = new SequenceFile.Reader(fs, files[i].getPath(), job.getConfiguration());

                IntWritable key = new IntWritable();
                NullWritable value = NullWritable.get();
                while (reader.next(key, value)) {
                    System.out.println("key: '" + key.get() + "' value: '" + value + "'\n");
                }
            } catch (IOException e) {
                // Fallback: dump the file as-is. The exception itself is not logged.
                // NOTE(review): presumably Hadoop's IOUtils, where the final 'false' means the
                // streams are not closed by the copy; no in.close() is visible - confirm.
                FSDataInputStream in = fs.open(files[i].getPath());
                IOUtils.copyBytes(in, System.out, job.getConfiguration(), false);
            } finally {
                if (reader != null) {
    // fs.delete(FileOutputFormat.getOutputPath(job), true);

From source file:at.illecker.hama.hybrid.examples.kmeans.KMeansHybridBSP.java

License:Apache License

/**
 * Writes every cached vector to the peer's output and, on the first peer only, writes the
 * current centers to the sequence file named by CONF_CENTER_OUT_PATH (when that is configured).
 *
 * <p>NOTE(review): this excerpt appears to have lost lines. {@code lowestDistantCenter} is
 * computed but no statement that stores it in {@code keyWrite} is visible, and the closing
 * braces and any {@code dataWriter.close()} are missing. Confirm against the full source.
 *
 * @param peer the BSP peer used for output and for identifying the first peer
 * @throws IOException declared by the signature; the visible code writes to the peer and to a file
 */
private void recalculateAssignmentsAndWrite(
        BSPPeer<PipesVectorWritable, NullWritable, IntWritable, PipesVectorWritable, CenterMessage> peer)
        throws IOException {

    // One reusable key object for all output records.
    IntWritable keyWrite = new IntWritable();
    for (DoubleVector v : m_cache) {
        final int lowestDistantCenter = getNearestCenter(v);
        peer.write(keyWrite, new PipesVectorWritable(v));
    }

    // just on the first task write the centers to filesystem to prevent
    // collisions
    if (peer.getPeerName().equals(peer.getPeerName(0))) {
        String pathString = m_conf.get(CONF_CENTER_OUT_PATH);
        // Nothing is written when no center output path is configured.
        if (pathString != null) {
            final SequenceFile.Writer dataWriter = SequenceFile.createWriter(FileSystem.get(m_conf), m_conf,
                    new Path(pathString), PipesVectorWritable.class, NullWritable.class, CompressionType.NONE);
            final NullWritable value = NullWritable.get();

            // Each center is stored as the record key; the value is always NullWritable.
            for (DoubleVector center : m_centers_cpu) {
                dataWriter.append(new PipesVectorWritable(center), value);

From source file:at.illecker.hama.hybrid.examples.kmeans.KMeansHybridBSP.java

License:Apache License

/**
 * Command-line entry point of the hybrid KMeans example. It starts from built-in defaults,
 * optionally overrides them from exactly 12 positional arguments, copies the settings into a
 * HamaConfiguration, prepares the input data, runs the BSP job and prints results.
 *
 * <p>NOTE(review): this excerpt appears to have lost lines. Several blocks are never closed,
 * two usage lines have lost the start of their call, and whatever follows the usage text
 * (for example an early return) is not visible. Confirm against the full source.
 *
 * @param args either empty (use defaults) or 12 values in the order listed by the usage text
 * @throws Exception declared by the signature
 */
public static void main(String[] args) throws Exception {

    // Defaults
    int numBspTask = 1;
    int numGpuBspTask = 1;
    int blockSize = BLOCK_SIZE;
    int gridSize = GRID_SIZE;
    long n = 10; // input vectors
    int k = 3; // start vectors
    int vectorDimension = 2;
    int maxIteration = 10;
    boolean useTestExampleInput = false;
    boolean isDebugging = false;
    boolean timeMeasurement = false;
    int GPUPercentage = 80;

    Configuration conf = new HamaConfiguration();
    FileSystem fs = FileSystem.get(conf);

    // Set numBspTask to maxTasks
    // BSPJobClient jobClient = new BSPJobClient(conf);
    // ClusterStatus cluster = jobClient.getClusterStatus(true);
    // numBspTask = cluster.getMaxTasks();

    // With no arguments the defaults above are used; otherwise exactly 12 are required.
    if (args.length > 0) {
        if (args.length == 12) {
            numBspTask = Integer.parseInt(args[0]);
            numGpuBspTask = Integer.parseInt(args[1]);
            blockSize = Integer.parseInt(args[2]);
            gridSize = Integer.parseInt(args[3]);
            n = Long.parseLong(args[4]);
            k = Integer.parseInt(args[5]);
            vectorDimension = Integer.parseInt(args[6]);
            maxIteration = Integer.parseInt(args[7]);
            useTestExampleInput = Boolean.parseBoolean(args[8]);
            GPUPercentage = Integer.parseInt(args[9]);
            isDebugging = Boolean.parseBoolean(args[10]);
            timeMeasurement = Boolean.parseBoolean(args[11]);

        } else {
            // Any other argument count prints the usage text.
            System.out.println("Wrong argument size!");
            System.out.println("    Argument1=numBspTask");
            System.out.println("    Argument2=numGpuBspTask");
            System.out.println("    Argument3=blockSize");
            System.out.println("    Argument4=gridSize");
            System.out.println("    Argument5=n | Number of input vectors (" + n + ")");
            System.out.println("    Argument6=k | Number of start vectors (" + k + ")");
            // NOTE(review): the next two lines have lost the line that begins their call.
                    "    Argument7=vectorDimension | Dimension of each vector (" + vectorDimension + ")");
                    "    Argument8=maxIterations | Number of maximal iterations (" + maxIteration + ")");
            System.out.println("    Argument9=testExample | Use testExample input (true|false=default)");
            System.out.println("    Argument10=GPUPercentage (percentage of input)");
            System.out.println("    Argument11=isDebugging (true|false=defaul)");
            System.out.println("    Argument12=timeMeasurement (true|false=defaul)");

    // Set config variables
    conf.setBoolean(CONF_DEBUG, isDebugging);
    conf.setBoolean("hama.pipes.logging", false);
    conf.setBoolean(CONF_TIME, timeMeasurement);

    // Set CPU tasks
    conf.setInt("bsp.peers.num", numBspTask);
    // Set GPU tasks
    conf.setInt("bsp.peers.gpu.num", numGpuBspTask);
    // Set GPU blockSize and gridSize
    conf.set(CONF_BLOCKSIZE, "" + blockSize);
    conf.set(CONF_GRIDSIZE, "" + gridSize);
    // Set maxIterations for KMeans
    conf.setInt(CONF_MAX_ITERATIONS, maxIteration);
    // Set n for KMeans
    conf.setLong(CONF_N, n);
    // Set GPU workload
    conf.setInt(CONF_GPU_PERCENTAGE, GPUPercentage);

    // Log the effective settings, read back from the configuration where they were stored.
    LOG.info("NumBspTask: " + conf.getInt("bsp.peers.num", 0));
    LOG.info("NumGpuBspTask: " + conf.getInt("bsp.peers.gpu.num", 0));
    LOG.info("bsp.tasks.maximum: " + conf.get("bsp.tasks.maximum"));
    LOG.info("GPUPercentage: " + conf.get(CONF_GPU_PERCENTAGE));
    LOG.info("BlockSize: " + conf.get(CONF_BLOCKSIZE));
    LOG.info("GridSize: " + conf.get(CONF_GRIDSIZE));
    LOG.info("isDebugging: " + conf.get(CONF_DEBUG));
    LOG.info("timeMeasurement: " + conf.get(CONF_TIME));
    LOG.info("useTestExampleInput: " + useTestExampleInput);
    LOG.info("inputPath: " + CONF_INPUT_DIR);
    LOG.info("centersPath: " + CONF_CENTER_DIR);
    LOG.info("outputPath: " + CONF_OUTPUT_DIR);
    LOG.info("n: " + n);
    LOG.info("k: " + k);
    LOG.info("vectorDimension: " + vectorDimension);
    LOG.info("maxIteration: " + maxIteration);

    // Center input/output files live in the center directory; their paths are published
    // through the configuration.
    Path centerIn = new Path(CONF_CENTER_DIR, "center_in.seq");
    Path centerOut = new Path(CONF_CENTER_DIR, "center_out.seq");
    conf.set(CONF_CENTER_IN_PATH, centerIn.toString());
    conf.set(CONF_CENTER_OUT_PATH, centerOut.toString());

    // prepare Input
    // The two calls differ only in the second-to-last argument: null for the test example,
    // a Random with the fixed seed 3337L otherwise.
    if (useTestExampleInput) {
        // prepareTestInput(conf, fs, input, centerIn);
        prepareInputData(conf, fs, CONF_INPUT_DIR, centerIn, numBspTask, numGpuBspTask, n, k, vectorDimension,
                null, GPUPercentage);
    } else {
        prepareInputData(conf, fs, CONF_INPUT_DIR, centerIn, numBspTask, numGpuBspTask, n, k, vectorDimension,
                new Random(3337L), GPUPercentage);

    BSPJob job = createKMeansHybridBSPConf(conf, CONF_INPUT_DIR, CONF_OUTPUT_DIR);

    long startTime = System.currentTimeMillis();
    if (job.waitForCompletion(true)) {
        LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

        // In debug mode print the resulting centers and the job output.
        if (isDebugging) {
            printFile(conf, fs, centerOut, new PipesVectorWritable(), NullWritable.get());
            printOutput(conf, fs, ".log", new IntWritable(), new PipesVectorWritable());

        // The centers are also printed when k is below 50.
        if (k < 50) {
            printFile(conf, fs, centerOut, new PipesVectorWritable(), NullWritable.get());

From source file:at.illecker.hama.hybrid.examples.matrixmultiplication.MatrixMultiplicationHybridBSP.java

License:Apache License

/********************************* CPU *********************************/
@Override/*from   w  w  w .  j  a v  a  2s .c  o  m*/
public void setup(
        BSPPeer<IntWritable, PipesVectorWritable, IntWritable, PipesVectorWritable, MatrixRowMessage> peer)
        throws IOException {

    HamaConfiguration conf = peer.getConfiguration();
    this.m_isDebuggingEnabled = conf.getBoolean(CONF_DEBUG, false);

    // Choose one as a master, who sorts the matrix rows at the end
    // m_masterTask = peer.getPeerName(peer.getNumPeers() / 2);

    // TODO
    // task must be 0 otherwise write out does NOT work!
    this.m_masterTask = peer.getPeerName(0);

    // Init logging
    if (m_isDebuggingEnabled) {
        try {
            FileSystem fs = FileSystem.get(conf);
            m_logger = fs.create(new Path(FileOutputFormat.getOutputPath(new BSPJob((HamaConfiguration) conf))
                    + "/BSP_" + peer.getTaskId() + ".log"));

        } catch (IOException e) {

    // Load transposed Matrix B
    SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get(conf),
            new Path(conf.get(CONF_MATRIX_MULT_B_PATH)), conf);

    IntWritable bKey = new IntWritable();
    PipesVectorWritable bVector = new PipesVectorWritable();

    // for each col of matrix B (cause by transposed B)
    while (reader.next(bKey, bVector)) {
        m_bColumns.add(new KeyValuePair<Integer, DoubleVector>(bKey.get(), bVector.getVector()));
        if (m_isDebuggingEnabled) {
            m_logger.writeChars("setup,read,transposedMatrixB,key=" + bKey.get() + ",value="
                    + bVector.getVector().toString() + "\n");