Example usage for org.apache.hadoop.fs FileStatus getLen

Introduction

On this page you can find example usage of org.apache.hadoop.fs.FileStatus.getLen().

Prototype

public long getLen() 

Document

Get the length of this file, in bytes.
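
Quick Example

The snippet below is a minimal, self-contained sketch of the call. It is not taken from any of the source files on this page; the default Configuration and the path /tmp/example.txt are illustrative assumptions.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetLenExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Resolves to the local filesystem unless fs.defaultFS points elsewhere
        FileSystem fs = FileSystem.get(conf);
        // Hypothetical path, used here for illustration only
        Path path = new Path("/tmp/example.txt");
        FileStatus status = fs.getFileStatus(path);
        // getLen() reports the file length in bytes
        System.out.println(path + ": " + status.getLen() + " bytes");
    }
}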

Usage

From source file: PageInputFormat.java

License: Apache License

public InputSplit[] getSplits(JobConf job, int num) throws IOException {
    long minSize = 1;
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    FileStatus[] files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        long length = file.getLen();
        if (length != 0) {
            BlockLocation[] blkLocations;
            FileSystem fs = path.getFileSystem(job);
            blkLocations = fs.getFileBlockLocations(file, 0, length);
            if (isSplitable(path.getFileSystem(job), path)) {
                long blockSize = file.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);

                long bytesRemaining = length;
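                // SPLIT_SLOP (1.1 in Hadoop's FileInputFormat) lets the last
                // split run up to 10% over splitSize rather than emit a tiny tail.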
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, splitSize,
                            blkLocations[blkIndex].getHosts()));
                    bytesRemaining -= splitSize;
                }

                if (bytesRemaining != 0) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
                            blkLocations[blkIndex].getHosts()));
                }
            } else
                splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts()));
        } else
            splits.add(makeSplit(path, 0, length, new String[0]));
    }
    // Save the number of input files for metrics/loadgen
    job.setLong(NUM_INPUT_FILES, files.length);
    return splits.toArray(new InputSplit[0]);

}

From source file: TestFuseDFS.java

License: Apache License

/**
 *
 * Test dfs_read on a file size that will trigger multiple internal reads.
 * First check that reading the raw size is OK, then check smaller reads,
 * including validating the data that was read.
 */
public void testReads() throws IOException, InterruptedException {
    try {
        // First create a new directory with mkdirs
        Runtime r = Runtime.getRuntime();
        Process p;

        // create the file
        Path myPath = new Path("/test/hello.reads");
        FSDataOutputStream s = fileSys.create(myPath);
        String hello = "hello world!";
        int written = 0;
        int mycount = 0;
        while (written < 1024 * 9) {
            s.writeUTF(hello);
            s.writeInt(mycount++);
            written += hello.length() + 4;
        }
        s.close();

        // check it exists
        assertTrue(fileSys.exists(myPath));
        FileStatus foo = fileSys.getFileStatus(myPath);
        assertTrue(foo.getLen() >= 9 * 1024);

        {
            // cat the file
            DataInputStream is = new DataInputStream(new FileInputStream(mpoint + "/test/hello.reads"));
            byte buf[] = new byte[4096];
            // test reading 0 length
            assertTrue(is.read(buf, 0, 0) == 0);

            // test real reads
            assertTrue(is.read(buf, 0, 1024) == 1024);
            assertTrue(is.read(buf, 0, 4096) == 4096);
            assertTrue(is.read(buf, 0, 4096) == 4096);
            is.close();
        }

        {
            DataInputStream is = new DataInputStream(new FileInputStream(mpoint + "/test/hello.reads"));
            int read = 0;
            int counter = 0;
            try {
                while (true) {
                    String s2 = DataInputStream.readUTF(is);
                    int s3 = is.readInt();
                    assertTrue(s2.equals(hello));
                    assertTrue(s3 == counter++);
                    read += hello.length() + 4;
                }
            } catch (EOFException e) {
                assertTrue(read >= 9 * 1024);
            }
        }

        // check reading an empty file for EOF
        {
            // create the file
            myPath = new Path("/test/hello.reads2");
            s = fileSys.create(myPath);
            s.close();

            FSDataInputStream fs = fileSys.open(myPath);
            assertEquals(-1, fs.read());

            FileInputStream f = new FileInputStream(mpoint + "/test/hello.reads2");
            assertEquals(-1, f.read());
        }

    } catch (Exception e) {
        e.printStackTrace();
    } finally {
    }
}

From source file: StreamWikiDumpInputFormat.java

License: Apache License

/**
 * Generate the list of files and make them into FileSplits.
 * @param job
 *            the job context
 * @throws IOException
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    LOG.info("StreamWikiDumpInputFormat.getSplits job=" + job + " n=" + numSplits);
    // InputSplit[] oldSplits = super.getSplits(job, numSplits);
    List<InputSplit> splits = new ArrayList<InputSplit>();
    FileStatus[] files = listStatus(job);
    // Save the number of input files for metrics/loadgen
    job.setLong("mapreduce.input.num.files", files.length);
    long totalSize = 0; // compute total size
    for (FileStatus file : files) { // check we have valid files
        if (file.isDir()) {
            throw new IOException("Not a file: " + file.getPath());
        }
        totalSize += file.getLen();
    }
    long minSize = 1;
    long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
    for (FileStatus file : files) {
        if (file.isDir()) {
            throw new IOException("Not a file: " + file.getPath());
        }
        long blockSize = file.getBlockSize();
        long splitSize = computeSplitSize(goalSize, minSize, blockSize);
        LOG.info(String.format("goalsize=%d splitsize=%d blocksize=%d", goalSize, splitSize, blockSize));
        // System.err.println(String.format("goalsize=%d splitsize=%d blocksize=%d",
        // goalSize, splitSize, blockSize));
        for (InputSplit x : getSplits(job, file, pageBeginPattern, splitSize))
            splits.add(x);
    }
    System.err.println("splits=" + splits);
    return splits.toArray(new InputSplit[splits.size()]);
}

From source file: StreamWikiDumpInputFormat.java

License: Apache License

public List<InputSplit> getSplits(JobConf job, FileStatus file, String pattern, long splitSize)
        throws IOException {
    NetworkTopology clusterMap = new NetworkTopology();
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Path path = file.getPath();
    long length = file.getLen();
    FileSystem fs = file.getPath().getFileSystem(job);
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if ((length != 0) && isSplitable(fs, path)) {

        long bytesRemaining = length;
        SeekableInputStream in = SeekableInputStream.getInstance(path, 0, length, fs, this.compressionCodecs);
        InputStream is = null; // never reassigned below, so the following null check is effectively dead code
        long start = 0;
        long skip = 0;
        if (is != null) {
            // start = is.getAdjustedStart();
            // length = is.getAdjustedEnd();
            is.close();
            in = null;
        }
        LOG.info("locations=" + Arrays.asList(blkLocations));
        FileSplit split = null;
        Set<Long> processedPageEnds = new HashSet<Long>();
        float factor = job.getFloat(KEY_SKIP_FACTOR, 1.2F);

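        // Look-ahead scan: grow a candidate split until the next page-end
        // marker is found, emit a split ending at that marker, and repeat
        // until the remaining bytes fall below factor * splitSize.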
        READLOOP: while (((double) bytesRemaining) / splitSize > factor && bytesRemaining > 0) {
            // prepare matcher
            ByteMatcher matcher;
            {
                long st = Math.min(start + skip + splitSize, length - 1);
                split = makeSplit(path, st, Math.min(splitSize, length - st), clusterMap, blkLocations);
                System.err.println("split move to: " + split);
                if (in != null)
                    in.close();
                if (split.getLength() <= 1) {
                    break;
                }
                in = SeekableInputStream.getInstance(split, fs, this.compressionCodecs);
                // SplitCompressionInputStream cin =
                // in.getSplitCompressionInputStream();
            }
            matcher = new ByteMatcher(in);

            // read until the next page end in the look-ahead split
            boolean reach = false;
            while (!matcher.readUntilMatch(pageEndPattern, null, split.getStart() + split.getLength())) {
                if (matcher.getPos() >= length || split.getLength() == length - split.getStart())
                    break READLOOP;
                reach = false;
                split = makeSplit(path, split.getStart(),
                        Math.min(split.getLength() + splitSize, length - split.getStart()), clusterMap,
                        blkLocations);
                System.err.println("split extend to: " + split);
            }
            System.err.println(
                    path + ": #" + splits.size() + " " + pageEndPattern + " found: pos=" + matcher.getPos()
                            + " last=" + matcher.getLastUnmatchPos() + " read=" + matcher.getReadBytes()
                            + " current=" + start + " remaining=" + bytesRemaining + " split=" + split);
            if (matcher.getLastUnmatchPos() > 0 && matcher.getPos() > matcher.getLastUnmatchPos()
                    && !processedPageEnds.contains(matcher.getPos())) {
                splits.add(makeSplit(path, start, matcher.getPos() - start, clusterMap, blkLocations));
                processedPageEnds.add(matcher.getPos());
                long newstart = Math.max(matcher.getLastUnmatchPos(), start);
                bytesRemaining = length - newstart;
                start = newstart;
                skip = 0;
            } else {
                skip = matcher.getPos() - start;
            }
        }

        if (bytesRemaining > 0 && !processedPageEnds.contains(length)) {
            System.err.println(
                    pageEndPattern + " remaining: pos=" + (length - bytesRemaining) + " end=" + length);
            splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
                    blkLocations[blkLocations.length - 1].getHosts()));
        }
        if (in != null)
            in.close();
    } else if (length != 0) {
        splits.add(makeSplit(path, 0, length, clusterMap, blkLocations));
    } else {
        // Create empty hosts array for zero length files
        splits.add(makeSplit(path, 0, length, new String[0]));
    }
    return splits;
}

From source file: DupleInputFormat.java

License: Apache License

/** 
 * Generate the list of files and make them into FileSplits.
 * @param job the job context
 * @throws IOException
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    // number of times each path has already been seen in the files list
    ArrayList<Integer> times = new ArrayList<Integer>();
    ArrayList<Path> paths = new ArrayList<Path>();

    for (FileStatus file : files) {
        Path path = file.getPath();
        long length = file.getLen();
        if (length != 0) {
            FileSystem fs = path.getFileSystem(job.getConfiguration());
            BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);

            int index;
            if ((index = paths.indexOf(path)) != -1)
                times.set(index, times.get(index) + 1);
            else {
                times.add(0);
                paths.add(path);
                index = times.size() - 1;
            }

            // not splitable
            splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts(), times.get(index)));

        } else {
            //Create empty hosts array for zero length files
            splits.add(makeSplit(path, 0, length, new String[0]));
        }
    }
    // Save the number of input files for metrics/loadgen
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    //LOG.debug("Total # of splits: " + splits.size());
    return splits;
}

From source file: MRDriver.java

License: Apache License

public int run(String args[]) throws Exception {
    FileSystem fs = null;
    Path samplesMapPath = null;

    float epsilon = Float.parseFloat(args[0]);
    double delta = Double.parseDouble(args[1]);
    int minFreqPercent = Integer.parseInt(args[2]);
    int d = Integer.parseInt(args[3]);
    int datasetSize = Integer.parseInt(args[4]);
    int numSamples = Integer.parseInt(args[5]);
    double phi = Double.parseDouble(args[6]);
    Random rand;

    /************************ Job 1 (local FIM) Configuration ************************/

    JobConf conf = new JobConf(getConf());

    /*
     * Compute the number of required "votes" for an itemsets to be
     * declared frequent    
     */
    // The +1 at the end is needed to ensure reqApproxNum > numSamples / 2.
    int reqApproxNum = (int) Math
            .floor((numSamples * (1 - phi)) - Math.sqrt(numSamples * (1 - phi) * 2 * Math.log(1 / delta))) + 1;
    int sampleSize = (int) Math.ceil((2 / Math.pow(epsilon, 2)) * (d + Math.log(1 / phi)));
    //System.out.println("reducersNum: " + numSamples + " reqApproxNum: " + reqApproxNum);

    conf.setInt("PARMM.reducersNum", numSamples);
    conf.setInt("PARMM.datasetSize", datasetSize);
    conf.setInt("PARMM.minFreqPercent", minFreqPercent);
    conf.setInt("PARMM.sampleSize", sampleSize);
    conf.setFloat("PARMM.epsilon", epsilon);

    // Set the number of reducers equal to the number of samples, to
    // maximize parallelism. Required by our Partitioner.
    conf.setNumReduceTasks(numSamples);

    // XXX: why do we disable the speculative execution? MR
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
    conf.setInt("mapred.task.timeout", MR_TIMEOUT_MILLI);

    /* 
     * Enable compression of map output.
     *
     * We do it for this job and not for the aggregation one because
     * each mapper there only print out one record for each itemset,
     * so there isn't much to compress, I'd say. MR
     *
     * In Amazon MapReduce compression of the map output seems to be
     * happen by default and the Snappy codec is used, which is
     * extremely fast.
     */
    conf.setBoolean("mapred.compress.map.output", true);
    //conf.setMapOutputCompressorClass(com.hadoop.compression.lzo.LzoCodec.class);

    conf.setJarByClass(MRDriver.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);

    conf.setInputFormat(SequenceFileInputFormat.class);
    // We write the collections found in the reducers as a SequenceFile
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(conf, new Path(args[9]));

    // set the mapper class based on command line option
    switch (Integer.parseInt(args[7])) {
    case 1:
        System.out.println("running partition mapper...");
        SequenceFileInputFormat.addInputPath(conf, new Path(args[8]));
        conf.setMapperClass(PartitionMapper.class);
        break;
    case 2:
        System.out.println("running binomial mapper...");
        SequenceFileInputFormat.addInputPath(conf, new Path(args[8]));
        conf.setMapperClass(BinomialSamplerMapper.class);
        break;
    case 3:
        System.out.println("running coin mapper...");
        SequenceFileInputFormat.addInputPath(conf, new Path(args[8]));
        conf.setMapperClass(CoinFlipSamplerMapper.class);
        break; // prevent fall-through into case 4, which would overwrite the mapper class
    case 4:
        System.out.println("running sampler mapper...");
        SequenceFileInputFormat.addInputPath(conf, new Path(args[8]));
        conf.setMapperClass(InputSamplerMapper.class);

        // create a random sample of size T*m
        rand = new Random();
        long sampling_start_time = System.nanoTime();
        int[] samples = new int[numSamples * sampleSize];
        for (int i = 0; i < numSamples * sampleSize; i++) {
            samples[i] = rand.nextInt(datasetSize);
        }

        // for each key in the sample, create a list of all T samples to which this key belongs
        Hashtable<LongWritable, ArrayList<IntWritable>> hashTable = new Hashtable<LongWritable, ArrayList<IntWritable>>();
        for (int i = 0; i < numSamples * sampleSize; i++) {
            ArrayList<IntWritable> sampleIDs = null;
            LongWritable key = new LongWritable(samples[i]);
            if (hashTable.containsKey(key))
                sampleIDs = hashTable.get(key);
            else
                sampleIDs = new ArrayList<IntWritable>();
            sampleIDs.add(new IntWritable(i % numSamples));
            hashTable.put(key, sampleIDs);
        }

        /*
         * Convert the Hastable to a MapWritable which we will
         * write to HDFS and distribute to all Mappers using
         * DistributedCache
         */
        MapWritable map = new MapWritable();
        for (LongWritable key : hashTable.keySet()) {
            ArrayList<IntWritable> sampleIDs = hashTable.get(key);
            IntArrayWritable sampleIDsIAW = new IntArrayWritable();
            sampleIDsIAW.set(sampleIDs.toArray(new IntWritable[sampleIDs.size()]));
            map.put(key, sampleIDsIAW);
        }

        fs = FileSystem.get(URI.create("samplesMap.ser"), conf);
        samplesMapPath = new Path("samplesMap.ser");
        FSDataOutputStream out = fs.create(samplesMapPath, true);
        map.write(out);
        out.sync();
        out.close();
        DistributedCache.addCacheFile(new URI(fs.getWorkingDirectory() + "/samplesMap.ser#samplesMap.ser"),
                conf);
        // stop the sampling timer   
        long sampling_end_time = System.nanoTime();
        long sampling_runtime = (sampling_end_time - sampling_start_time) / 1000000;
        System.out.println("sampling runtime (milliseconds): " + sampling_runtime);
        break; // end switch case
    case 5:
        System.out.println("running random integer partition mapper...");
        conf.setInputFormat(WholeSplitInputFormat.class);
        Path inputFilePath = new Path(args[8]);
        WholeSplitInputFormat.addInputPath(conf, inputFilePath);
        conf.setMapperClass(RandIntPartSamplerMapper.class);
        // Compute number of map tasks.
        fs = inputFilePath.getFileSystem(conf);
        FileStatus inputFileStatus = fs.getFileStatus(inputFilePath);
        long len = inputFileStatus.getLen();
        long blockSize = inputFileStatus.getBlockSize();
        conf.setLong("mapred.min.split.size", blockSize);
        conf.setLong("mapred.max.split.size", blockSize);
        int mapTasksNum = ((int) (len / blockSize)) + 1;
        conf.setNumMapTasks(mapTasksNum);
        //System.out.println("len: " + len + " blockSize: " 
        //      + blockSize + " mapTasksNum: " + mapTasksNum);
        // Extract random integer partition of total sample
        // size into up to mapTasksNum partitions.
        // XXX I'm not sure this is a correct way to do
        // it.
        rand = new Random();
        IntWritable[][] toSampleArr = new IntWritable[mapTasksNum][numSamples];
        for (int j = 0; j < numSamples; j++) {
            IntWritable[] tempToSampleArr = new IntWritable[mapTasksNum];
            int sum = 0;
            int i;
            for (i = 0; i < mapTasksNum - 1; i++) {
                int size = rand.nextInt(sampleSize - sum);
                tempToSampleArr[i] = new IntWritable(size);
                sum += size;
                if (sum > numSamples * sampleSize) {
                    System.out.println("Something went wrong generating the sample Sizes");
                    System.exit(1);
                }
                if (sum == sampleSize) {
                    break;
                }
            }
            if (i == mapTasksNum - 1) {
                tempToSampleArr[i] = new IntWritable(sampleSize - sum);
            } else {
                for (; i < mapTasksNum; i++) {
                    tempToSampleArr[i] = new IntWritable(0);
                }
            }
            Collections.shuffle(Arrays.asList(tempToSampleArr));
            for (i = 0; i < mapTasksNum; i++) {
                toSampleArr[i][j] = tempToSampleArr[i];
            }
        }

        for (int i = 0; i < mapTasksNum; i++) {
            DefaultStringifier.storeArray(conf, toSampleArr[i], "PARMM.toSampleArr_" + i);
        }
        break;
    default:
        System.err.println("Wrong Mapper ID. Can only be in [1,5]");
        System.exit(1);
        break;
    }

    /*
     * We don't use the default hash partitioner because we want to
     * maximize the parallelism. That's why we also fix the number
     * of reducers.
     */
    conf.setPartitionerClass(FIMPartitioner.class);

    conf.setReducerClass(FIMReducer.class);

    /************************ Job 2 (aggregation) Configuration ************************/

    JobConf confAggr = new JobConf(getConf());

    confAggr.setInt("PARMM.reducersNum", numSamples);
    confAggr.setInt("PARMM.reqApproxNum", reqApproxNum);
    confAggr.setInt("PARMM.sampleSize", sampleSize);
    confAggr.setFloat("PARMM.epsilon", epsilon);

    // XXX: Why do we disable speculative execution? MR
    confAggr.setBoolean("mapred.reduce.tasks.speculative.execution", false);
    confAggr.setInt("mapred.task.timeout", MR_TIMEOUT_MILLI);

    confAggr.setJarByClass(MRDriver.class);

    confAggr.setMapOutputKeyClass(Text.class);
    confAggr.setMapOutputValueClass(DoubleWritable.class);

    confAggr.setOutputKeyClass(Text.class);
    confAggr.setOutputValueClass(Text.class);

    confAggr.setMapperClass(AggregateMapper.class);
    confAggr.setReducerClass(AggregateReducer.class);

    confAggr.setInputFormat(CombineSequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(confAggr, new Path(args[9]));

    FileOutputFormat.setOutputPath(confAggr, new Path(args[10]));

    long FIMjob_start_time = System.currentTimeMillis();
    RunningJob FIMjob = JobClient.runJob(conf);
    long FIMjob_end_time = System.currentTimeMillis();

    RunningJob aggregateJob = JobClient.runJob(confAggr);
    long aggrJob_end_time = System.currentTimeMillis();

    long FIMjob_runtime = FIMjob_end_time - FIMjob_start_time;

    long aggrJob_runtime = aggrJob_end_time - FIMjob_end_time;

    if (args[7].equals("4")) {
        // Remove samplesMap file 
        fs.delete(samplesMapPath, false);
    }

    Counters counters = FIMjob.getCounters();
    Counters.Group FIMMapperStartTimesCounters = counters.getGroup("FIMMapperStart");
    long[] FIMMapperStartTimes = new long[FIMMapperStartTimesCounters.size()];
    int i = 0;
    for (Counters.Counter counter : FIMMapperStartTimesCounters) {
        FIMMapperStartTimes[i++] = counter.getCounter();
    }

    Counters.Group FIMMapperEndTimesCounters = counters.getGroup("FIMMapperEnd");
    long[] FIMMapperEndTimes = new long[FIMMapperEndTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : FIMMapperEndTimesCounters) {
        FIMMapperEndTimes[i++] = counter.getCounter();
    }

    Counters.Group FIMReducerStartTimesCounters = counters.getGroup("FIMReducerStart");
    long[] FIMReducerStartTimes = new long[FIMReducerStartTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : FIMReducerStartTimesCounters) {
        FIMReducerStartTimes[i++] = counter.getCounter();
    }

    Counters.Group FIMReducerEndTimesCounters = counters.getGroup("FIMReducerEnd");
    long[] FIMReducerEndTimes = new long[FIMReducerEndTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : FIMReducerEndTimesCounters) {
        FIMReducerEndTimes[i++] = counter.getCounter();
    }

    Counters countersAggr = aggregateJob.getCounters();
    Counters.Group AggregateMapperStartTimesCounters = countersAggr.getGroup("AggregateMapperStart");
    long[] AggregateMapperStartTimes = new long[AggregateMapperStartTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : AggregateMapperStartTimesCounters) {
        AggregateMapperStartTimes[i++] = counter.getCounter();
    }

    Counters.Group AggregateMapperEndTimesCounters = countersAggr.getGroup("AggregateMapperEnd");
    long[] AggregateMapperEndTimes = new long[AggregateMapperEndTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : AggregateMapperEndTimesCounters) {
        AggregateMapperEndTimes[i++] = counter.getCounter();
    }

    Counters.Group AggregateReducerStartTimesCounters = countersAggr.getGroup("AggregateReducerStart");
    long[] AggregateReducerStartTimes = new long[AggregateReducerStartTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : AggregateReducerStartTimesCounters) {
        AggregateReducerStartTimes[i++] = counter.getCounter();
    }

    Counters.Group AggregateReducerEndTimesCounters = countersAggr.getGroup("AggregateReducerEnd");
    long[] AggregateReducerEndTimes = new long[AggregateReducerEndTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : AggregateReducerEndTimesCounters) {
        AggregateReducerEndTimes[i++] = counter.getCounter();
    }

    long FIMMapperStartMin = FIMMapperStartTimes[0];
    for (long l : FIMMapperStartTimes) {
        if (l < FIMMapperStartMin) {
            FIMMapperStartMin = l;
        }
    }
    long FIMMapperEndMax = FIMMapperEndTimes[0];
    for (long l : FIMMapperEndTimes) {
        if (l > FIMMapperEndMax) {
            FIMMapperEndMax = l;
        }
    }
    System.out.println("FIM job setup time (milliseconds): " + (FIMMapperStartMin - FIMjob_start_time));
    System.out.println("FIMMapper total runtime (milliseconds): " + (FIMMapperEndMax - FIMMapperStartMin));
    long[] FIMMapperRunTimes = new long[FIMMapperStartTimes.length];
    long FIMMapperRunTimesSum = 0;
    for (int l = 0; l < FIMMapperStartTimes.length; l++) {
        FIMMapperRunTimes[l] = FIMMapperEndTimes[l] - FIMMapperStartTimes[l];
        FIMMapperRunTimesSum += FIMMapperRunTimes[l];
    }
    System.out.println("FIMMapper average task runtime (milliseconds): "
            + FIMMapperRunTimesSum / FIMMapperStartTimes.length);
    long FIMMapperRunTimesMin = FIMMapperRunTimes[0];
    long FIMMapperRunTimesMax = FIMMapperRunTimes[0];
    for (long l : FIMMapperRunTimes) {
        if (l < FIMMapperRunTimesMin) {
            FIMMapperRunTimesMin = l;
        }
        if (l > FIMMapperRunTimesMax) {
            FIMMapperRunTimesMax = l;
        }
    }
    System.out.println("FIMMapper minimum task runtime (milliseconds): " + FIMMapperRunTimesMin);
    System.out.println("FIMMapper maximum task runtime (milliseconds): " + FIMMapperRunTimesMax);

    long FIMReducerStartMin = FIMReducerStartTimes[0];
    for (long l : FIMReducerStartTimes) {
        if (l < FIMReducerStartMin) {
            FIMReducerStartMin = l;
        }
    }
    long FIMReducerEndMax = FIMReducerEndTimes[0];
    for (long l : FIMReducerEndTimes) {
        if (l > FIMReducerEndMax) {
            FIMReducerEndMax = l;
        }
    }
    System.out
            .println("FIM job shuffle phase runtime (milliseconds): " + (FIMReducerStartMin - FIMMapperEndMax));
    System.out.println("FIMReducer total runtime (milliseconds): " + (FIMReducerEndMax - FIMReducerStartMin));
    long[] FIMReducerRunTimes = new long[FIMReducerStartTimes.length];
    long FIMReducerRunTimesSum = 0;
    for (int l = 0; l < FIMReducerStartTimes.length; l++) {
        FIMReducerRunTimes[l] = FIMReducerEndTimes[l] - FIMReducerStartTimes[l];
        FIMReducerRunTimesSum += FIMReducerRunTimes[l];
    }
    System.out.println("FIMReducer average task runtime (milliseconds): "
            + FIMReducerRunTimesSum / FIMReducerStartTimes.length);
    long FIMReducerRunTimesMin = FIMReducerRunTimes[0];
    long FIMReducerRunTimesMax = FIMReducerRunTimes[0];
    for (long l : FIMReducerRunTimes) {
        if (l < FIMReducerRunTimesMin) {
            FIMReducerRunTimesMin = l;
        }
        if (l > FIMReducerRunTimesMax) {
            FIMReducerRunTimesMax = l;
        }
    }
    System.out.println("FIMReducer minimum task runtime (milliseconds): " + FIMReducerRunTimesMin);
    System.out.println("FIMReducer maximum task runtime (milliseconds): " + FIMReducerRunTimesMax);
    System.out.println("FIM job cooldown time (milliseconds): " + (FIMjob_end_time - FIMReducerEndMax));

    long AggregateMapperStartMin = AggregateMapperStartTimes[0];
    for (long l : AggregateMapperStartTimes) {
        if (l < AggregateMapperStartMin) {
            AggregateMapperStartMin = l;
        }
    }
    long AggregateMapperEndMax = AggregateMapperEndTimes[0];
    for (long l : AggregateMapperEndTimes) {
        if (l > AggregateMapperEndMax) {
            AggregateMapperEndMax = l;
        }
    }
    System.out.println(
            "Aggregation job setup time (milliseconds): " + (AggregateMapperStartMin - FIMjob_end_time));
    System.out.println("AggregateMapper total runtime (milliseconds): "
            + (AggregateMapperEndMax - AggregateMapperStartMin));
    long[] AggregateMapperRunTimes = new long[AggregateMapperStartTimes.length];
    long AggregateMapperRunTimesSum = 0;
    for (int l = 0; l < AggregateMapperStartTimes.length; l++) {
        AggregateMapperRunTimes[l] = AggregateMapperEndTimes[l] - AggregateMapperStartTimes[l];
        AggregateMapperRunTimesSum += AggregateMapperRunTimes[l];
    }
    System.out.println("AggregateMapper average task runtime (milliseconds): "
            + AggregateMapperRunTimesSum / AggregateMapperStartTimes.length);
    long AggregateMapperRunTimesMin = AggregateMapperRunTimes[0];
    long AggregateMapperRunTimesMax = AggregateMapperRunTimes[0];
    for (long l : AggregateMapperRunTimes) {
        if (l < AggregateMapperRunTimesMin) {
            AggregateMapperRunTimesMin = l;
        }
        if (l > AggregateMapperRunTimesMax) {
            AggregateMapperRunTimesMax = l;
        }
    }
    System.out.println("AggregateMapper minimum task runtime (milliseconds): " + AggregateMapperRunTimesMin);
    System.out.println("AggregateMapper maximum task runtime (milliseconds): " + AggregateMapperRunTimesMax);

    long AggregateReducerStartMin = AggregateReducerStartTimes[0];
    for (long l : AggregateReducerStartTimes) {
        if (l < AggregateReducerStartMin) {
            AggregateReducerStartMin = l;
        }
    }
    long AggregateReducerEndMax = AggregateReducerEndTimes[0];
    for (long l : AggregateReducerEndTimes) {
        if (l > AggregateReducerEndMax) {
            AggregateReducerEndMax = l;
        }
    }
    System.out.println("Aggregate job round shuffle phase runtime (milliseconds): "
            + (AggregateReducerStartMin - AggregateMapperEndMax));
    System.out.println("AggregateReducer total runtime (milliseconds): "
            + (AggregateReducerEndMax - AggregateReducerStartMin));
    long[] AggregateReducerRunTimes = new long[AggregateReducerStartTimes.length];
    long AggregateReducerRunTimesSum = 0;
    for (int l = 0; l < AggregateReducerStartTimes.length; l++) {
        AggregateReducerRunTimes[l] = AggregateReducerEndTimes[l] - AggregateReducerStartTimes[l];
        AggregateReducerRunTimesSum += AggregateReducerRunTimes[l];
    }
    System.out.println("AggregateReducer average task runtime (milliseconds): "
            + AggregateReducerRunTimesSum / AggregateReducerStartTimes.length);
    long AggregateReducerRunTimesMin = AggregateReducerRunTimes[0];
    long AggregateReducerRunTimesMax = AggregateReducerRunTimes[0];
    for (long l : AggregateReducerRunTimes) {
        if (l < AggregateReducerRunTimesMin) {
            AggregateReducerRunTimesMin = l;
        }
        if (l > AggregateReducerRunTimesMax) {
            AggregateReducerRunTimesMax = l;
        }
    }
    System.out.println("AggregateReducer minimum task runtime (milliseconds): " + AggregateReducerRunTimesMin);
    System.out.println("AggregateReducer maximum task runtime (milliseconds): " + AggregateReducerRunTimesMax);

    System.out.println(
            "Aggregation job cooldown time (milliseconds): " + (aggrJob_end_time - AggregateReducerEndMax));

    System.out
            .println("total runtime (all inclusive) (milliseconds): " + (aggrJob_end_time - FIMjob_start_time));
    System.out.println("total runtime (no FIM job setup, no aggregation job cooldown) (milliseconds): "
            + (AggregateReducerEndMax - FIMMapperStartMin));
    System.out.println("total runtime (no setups, no cooldowns) (milliseconds): "
            + (FIMReducerEndMax - FIMMapperStartMin + AggregateReducerEndMax - AggregateMapperStartMin));
    System.out.println("FIM job runtime (including setup and cooldown) (milliseconds): " + FIMjob_runtime);
    System.out.println("FIM job runtime (no setup, no cooldown) (milliseconds): "
            + (FIMReducerEndMax - FIMMapperStartMin));
    System.out.println(
            "Aggregation job runtime (including setup and cooldown) (milliseconds): " + aggrJob_runtime);
    System.out.println("Aggregation job runtime (no setup, no cooldown) (milliseconds): "
            + (AggregateReducerEndMax - AggregateMapperStartMin));

    return 0;
}

From source file: AggregatedLogsPurger.java

License: Apache License

private long getLengthRecursively(FileSystem fs, Path path) throws IOException {
    long size = 0;
    for (FileStatus status : fs.listStatus(path)) {
        if (status.isDirectory()) {
            size += getLengthRecursively(fs, status.getPath()); // accumulate sizes from subdirectories
        } else {
            size += status.getLen();
        }
    }
    return size;
}

From source file: RawParascaleFileSystem.java

License: Apache License

/**
 * {@inheritDoc}
 */
@Override
public BlockLocation[] getFileBlockLocations(final FileStatus file, final long start, final long len)
        throws IOException {
    ChunkLocator newChunkLocator = null;
    if (file.getLen() < start + len) {
        throw new IOException("start+len must be less or equal than file length");
    }
    final ArrayList<BlockLocation> locations = new ArrayList<BlockLocation>();
    try {
        newChunkLocator = newChunkLocator();
        final Path makeQualified = file.getPath().makeQualified(this.getUri(), this.getWorkingDirectory());

        // sorted by offset
        final ChunkLocation[] chunkLocations = newChunkLocator.getChunkLocations(pathToFile(makeQualified),
                getVirtualFSFromPath(makeQualified, true));
        long begin = start;
        long length = len;
        for (final ChunkLocation chunkLocation : chunkLocations) {
            final ChunkInfo chunkInfo = chunkLocation.getChunkInfo();
            final StorageNodeInfo[] storageNodeInfo = chunkLocation.getStorageNodeInfo();
            if (length <= 0) {
                // stop when length exceeded
                break;
            }
            if (begin < chunkInfo.getChunkOffset()) {
                // skip if location not reached yet
                continue;
            }
            final List<String> hosts = new ArrayList<String>(0);
            for (int j = 0; j < storageNodeInfo.length; j++) {
                // select all enabled and running nodes
                if (storageNodeInfo[j].isUp() && storageNodeInfo[j].isEnabled()) {
                    hosts.add(storageNodeInfo[j].getNodeName());
                }
            }
            final long lengthInChunk = chunkInfo.getChunkLength() - (begin - chunkInfo.getChunkOffset());
            final BlockLocation blockLocation = new BlockLocation(null, hosts.toArray(new String[0]), begin,
                    lengthInChunk < length ? lengthInChunk : length);
            begin += blockLocation.getLength();
            length -= blockLocation.getLength();
            locations.add(blockLocation);

        }
        if (pLog.isDebugEnabled()) {
            pLog.debug("Fetched " + locations.size() + " chunk locations for " + makeQualified);
        }

        return locations.toArray(new BlockLocation[0]);

    } catch (final ChunkStorageException e) {
        // Parentheses are required around the ternary: '+' binds tighter than
        // '==', so without them the concatenated string would be compared to null.
        throw new IOException(
                "can not fetch chunk locations " + (newChunkLocator == null ? "" : newChunkLocator.toString()),
                e);
    } finally {
        if (newChunkLocator != null) {
            newChunkLocator.close();
        }
    }
}

From source file: a.TestConcatExample.java

License: Apache License

@Test
public void concatIsPermissive() throws IOException, URISyntaxException {
    MiniDFSCluster cluster = null;
    final Configuration conf = WebHdfsTestUtil.createConf();
    conf.set("dfs.namenode.fs-limits.min-block-size", "1000"); // Allow tiny blocks for the test
    try {
        cluster = new MiniDFSCluster.Builder(conf).numDataNodes(2).build();
        cluster.waitActive();
        final FileSystem webHdfs = WebHdfsTestUtil.getWebHdfsFileSystem(conf, WebHdfsFileSystem.SCHEME);
        final FileSystem dfs = cluster.getFileSystem();

        final FileSystem fs = dfs; // WebHDFS has a bug in getLocatedBlocks

        Path root = new Path("/dir");
        fs.mkdirs(root);

        short origRep = 3;
        short secondRep = (short) (origRep - 1);
        Path f1 = new Path("/dir/f1");
        long size1 = writeFile(fs, f1, /* blocksize */ 4096, origRep, 5);
        long f1NumBlocks = fs.getFileBlockLocations(f1, 0, size1).length;
        assertEquals(5, f1NumBlocks);

        Path f2 = new Path("/dir/f2");
        long size2 = writeFile(fs, f2, /* blocksize (must be a multiple of 512 for checksums) */ 4096 - 512, secondRep, 4);
        long f2NumBlocks = fs.getFileBlockLocations(f2, 0, size2).length;
        assertEquals(5, f2NumBlocks);

        fs.concat(f1, new Path[] { f2 });
        FileStatus[] fileStatuses = fs.listStatus(root);

        // Only one file should remain
        assertEquals(1, fileStatuses.length);
        FileStatus fileStatus = fileStatuses[0];

        // And it should be named after the first file
        assertEquals("f1", fileStatus.getPath().getName());

        // The entire file takes the replication of the first argument
        assertEquals(origRep, fileStatus.getReplication());

        // As expected, the new concatenated file has the combined length of the two previous files
        assertEquals(size1 + size2, fileStatus.getLen());

        // And we should have the same number of blocks
        assertEquals(f1NumBlocks + f2NumBlocks,
                fs.getFileBlockLocations(fileStatus.getPath(), 0, size1 + size2).length);
    } finally {
        if (cluster != null) {
            cluster.shutdown();
        }

    }
}

From source file: a.TestConcatExample.java

License: Apache License

private long writeFile(FileSystem fs, Path p, int blockSize, short replication, int numBlocks)
        throws IOException {

    int bufferSize = 4096;
    FSDataOutputStream os = fs.create(p, true, bufferSize, replication, blockSize);

    int i = 0;

    byte[] data = new byte[bufferSize];
    r.nextBytes(data);
    while (i < blockSize * numBlocks) {
        os.write(data);
        i += data.length;
    }
    os.close();
    FileStatus fileStatus = fs.getFileStatus(p);
    long f1Len = fileStatus.getLen();

    assertEquals(i, f1Len);

    return f1Len;
}