Example usage for org.apache.hadoop.fs FileSystem create

List of usage examples for org.apache.hadoop.fs FileSystem create


On this page you can find example usages of org.apache.hadoop.fs.FileSystem.create.


public FSDataOutputStream create(Path f, short replication) throws IOException 

Source Link


Create an FSDataOutputStream at the indicated Path.


From source file:WikipediaForwardIndexBuilder.java

License:Apache License

/**
 * Parses the command line, runs the forward-index MapReduce job, and reads the job output
 * back in order to write the index file.
 *
 * <p>NOTE(review): this listing is an incomplete excerpt. Closing braces and some statements
 * are not present, so the comments below describe only what the visible lines do.
 *
 * @param args command-line arguments, parsed with {@code GnuParser}
 * @return {@code -1} when argument parsing or validation fails, {@code 0} once the job output
 *         has been processed and the temporary directory deleted
 * @throws Exception declared by the signature; a {@code RuntimeException} is thrown when the
 *         processed block count differs from the job's block counter
 */
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    // NOTE(review): the next two lines are continuation fragments; the statements they belong
    // to (presumably options.addOption(...) calls) are missing from this excerpt.
            OptionBuilder.withArgName("path").hasArg().withDescription("index file").create(INDEX_FILE_OPTION));
            .withDescription("two-letter language code").create(LANGUAGE_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;

    // Both the input path and the index file are required; print usage otherwise.
    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_FILE_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        return -1;

    Path inputPath = new Path(cmdline.getOptionValue(INPUT_OPTION));
    String indexFile = cmdline.getOptionValue(INDEX_FILE_OPTION);

    // Temporary job output directory: "tmp-<class simple name>-<random int below 10000>".
    String tmpPath = "tmp-" + WikipediaForwardIndexBuilder.class.getSimpleName() + "-" + RANDOM.nextInt(10000);

    if (!inputPath.isAbsolute()) {
        System.err.println("Error: " + INPUT_OPTION + " must be an absolute path!");
        return -1;

    // The language option is optional; when given it must be exactly two characters long.
    String language = null;
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
        language = cmdline.getOptionValue(LANGUAGE_OPTION);
        if (language.length() != 2) {
            System.err.println("Error: \"" + language + "\" unknown language!");
            return -1;

    JobConf conf = new JobConf(getConf(), WikipediaForwardIndexBuilder.class);
    FileSystem fs = FileSystem.get(conf);

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - index file: " + indexFile);
    LOG.info(" - language: " + language);
    LOG.info("Note: This tool only works on block-compressed SequenceFiles!");

    conf.setJobName(String.format("BuildWikipediaForwardIndex[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath,
            INDEX_FILE_OPTION, indexFile, LANGUAGE_OPTION, language));

    // NOTE(review): no mapper/reducer or input/output format setup is visible in this excerpt.

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, new Path(tmpPath));
    // Job output is left uncompressed; it is read back line by line below.
    FileOutputFormat.setCompressOutput(conf, false);

    if (language != null) {
        conf.set("wiki.language", language);



    // Delete the output directory if it exists already.
    fs.delete(new Path(tmpPath), true);

    RunningJob job = JobClient.runJob(conf);

    // Block total reported by the job's Blocks.Total counter; compared against cnt below.
    Counters counters = job.getCounters();
    int blocks = (int) counters.getCounter(Blocks.Total);

    LOG.info("number of blocks: " + blocks);

    LOG.info("Writing index file...");
    // Read part-00000 of the job output; create the index file with overwrite enabled.
    LineReader reader = new LineReader(fs.open(new Path(tmpPath + "/part-00000")));
    FSDataOutputStream out = fs.create(new Path(indexFile), true);


    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        // Each line holds whitespace-separated fields: docno, offset, fileno.
        String[] arr = line.toString().split("\\s+");

        int docno = Integer.parseInt(arr[0]);
        int offset = Integer.parseInt(arr[1]);
        short fileno = Short.parseShort(arr[2]);

        // NOTE(review): the statements that write these values to out and advance cnt are
        // not visible in this excerpt.


        if (cnt % 100000 == 0) {
            LOG.info(cnt + " blocks written");


    // The number of processed lines must match the counter reported by the job.
    if (cnt != blocks) {
        throw new RuntimeException("Error: mismatch in block count!");

    // Clean up.
    fs.delete(new Path(tmpPath), true);

    return 0;

From source file:ReadAllTest.java

License:Apache License

/**
 * Creates a file on the given HDFS instance with replication factor 1, then opens it for
 * reading and performs a positional {@code readFully} into a buffer.
 *
 * <p>NOTE(review): this listing is an incomplete excerpt. Closing braces and the bodies of
 * the try/finally blocks (presumably the write, the close calls and the checks) are missing.
 *
 * @param args {@code args[0]} is the HDFS URI, {@code args[1]} the file to create and read
 * @throws Exception declared by the signature
 */
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.out.println("ReadAllTest: must supply the HDFS uri and file to read");
        System.exit(1);
    String hdfsUri = args[0];
    String fileName = args[1];
    final Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(new URI(hdfsUri), conf);

    // Reference data: ten bytes holding the values 0..9.
    byte ORIGINAL[] = new byte[10];
    for (int i = 0; i < ORIGINAL.length; i++) {
        ORIGINAL[i] = (byte) i;
    // create(Path, short): the second argument is the replication factor (1 here).
    FSDataOutputStream out = fs.create(new Path(fileName), (short) 1);
    try {
    } finally {
    // Buffer sized to hold the whole reference array when reading back.
    byte input[] = new byte[ORIGINAL.length];
    FSDataInputStream in = fs.open(new Path(fileName));
    try {
    } finally {
    // The file is opened a second time for the positional read.
    in = fs.open(new Path(fileName));
    try {
        // Positional read: fill input starting at file offset 0.
        in.readFully(0, input);
    } finally {

From source file:MRDriver.java

License:Apache License

/**
 * Configures two MapReduce jobs (Job 1 "local FIM" and Job 2 "aggregation"), runs them one
 * after the other, and prints timing statistics derived from the jobs' counters.
 *
 * <p>Arguments as read by the visible code: {@code args[0]} epsilon, {@code args[1]} delta,
 * {@code args[2]} minFreqPercent, {@code args[3]} d, {@code args[4]} datasetSize,
 * {@code args[5]} numSamples, {@code args[6]} phi, {@code args[7]} mapper id (1 to 5),
 * {@code args[8]} input path, {@code args[9]} output path of Job 1 and input path of Job 2,
 * {@code args[10]} output path of Job 2.
 *
 * <p>NOTE(review): this listing is an incomplete excerpt. Closing braces, some statements,
 * the heads of several print statements and the delimiters of several block comments are
 * missing, so the comments below describe only what the visible lines do.
 *
 * @param args positional arguments as listed above
 * @return {@code 0} after both jobs have run and the statistics have been printed
 * @throws Exception declared by the signature
 */
public int run(String args[]) throws Exception {
    FileSystem fs = null;
    Path samplesMapPath = null;

    float epsilon = Float.parseFloat(args[0]);
    double delta = Double.parseDouble(args[1]);
    int minFreqPercent = Integer.parseInt(args[2]);
    int d = Integer.parseInt(args[3]);
    int datasetSize = Integer.parseInt(args[4]);
    int numSamples = Integer.parseInt(args[5]);
    double phi = Double.parseDouble(args[6]);
    Random rand;

    /************************ Job 1 (local FIM) Configuration ************************/

    JobConf conf = new JobConf(getConf());

    // NOTE(review): the next two lines are block-comment text whose delimiters were lost.
     * Compute the number of required "votes" for an itemsets to be
     * declared frequent    
    // The +1 at the end is needed to ensure reqApproxNum > numsamples / 2.
    int reqApproxNum = (int) Math
            .floor((numSamples * (1 - phi)) - Math.sqrt(numSamples * (1 - phi) * 2 * Math.log(1 / delta))) + 1;
    // Size of each sample, computed from epsilon, d and phi.
    int sampleSize = (int) Math.ceil((2 / Math.pow(epsilon, 2)) * (d + Math.log(1 / phi)));
    //System.out.println("reducersNum: " + numSamples + " reqApproxNum: " + reqApproxNum);

    conf.setInt("PARMM.reducersNum", numSamples);
    conf.setInt("PARMM.datasetSize", datasetSize);
    conf.setInt("PARMM.minFreqPercent", minFreqPercent);
    conf.setInt("PARMM.sampleSize", sampleSize);
    conf.setFloat("PARMM.epsilon", epsilon);

    // Set the number of reducers equal to the number of samples, to
    // maximize parallelism. Required by our Partitioner.
    // NOTE(review): the call that sets the reducer count is not visible in this excerpt.

    // XXX: why do we disable the speculative execution? MR
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
    conf.setInt("mapred.task.timeout", MR_TIMEOUT_MILLI);

    // NOTE(review): the next seven lines are block-comment text whose delimiters were lost.
     * Enable compression of map output.
     * We do it for this job and not for the aggregation one because
     * each mapper there only print out one record for each itemset,
     * so there isn't much to compress, I'd say. MR
     * In Amazon MapReduce compression of the map output seems to be
     * happen by default and the Snappy codec is used, which is
     * extremely fast.
    conf.setBoolean("mapred.compress.map.output", true);




    // We write the collections found in a reducers as a SequenceFile 
    SequenceFileOutputFormat.setOutputPath(conf, new Path(args[9]));

    // set the mapper class based on command line option
    // NOTE(review): for cases 1-3 no mapper-class setup or break statement is visible in this
    // excerpt; they were presumably dropped during extraction.
    switch (Integer.parseInt(args[7])) {
    case 1:
        System.out.println("running partition mapper...");
        SequenceFileInputFormat.addInputPath(conf, new Path(args[8]));
    case 2:
        System.out.println("running binomial mapper...");
        SequenceFileInputFormat.addInputPath(conf, new Path(args[8]));
    case 3:
        System.out.println("running coin mapper...");
        SequenceFileInputFormat.addInputPath(conf, new Path(args[8]));
    case 4:
        System.out.println("running sampler mapper...");
        SequenceFileInputFormat.addInputPath(conf, new Path(args[8]));

        // create a random sample of size T*m
        rand = new Random();
        long sampling_start_time = System.nanoTime();
        // numSamples * sampleSize indices, each drawn from [0, datasetSize).
        int[] samples = new int[numSamples * sampleSize];
        for (int i = 0; i < numSamples * sampleSize; i++) {
            samples[i] = rand.nextInt(datasetSize);

        // for each key in the sample, create a list of all T samples to which this key belongs
        Hashtable<LongWritable, ArrayList<IntWritable>> hashTable = new Hashtable<LongWritable, ArrayList<IntWritable>>();
        for (int i = 0; i < numSamples * sampleSize; i++) {
            ArrayList<IntWritable> sampleIDs = null;
            LongWritable key = new LongWritable(samples[i]);
            // NOTE(review): as shown, the second assignment is unconditional and replaces the
            // list that was just looked up; an "else" appears to be missing from this excerpt.
            if (hashTable.containsKey(key))
                sampleIDs = hashTable.get(key);
                sampleIDs = new ArrayList<IntWritable>();
            // Position i of the samples array is assigned to sample number i % numSamples.
            sampleIDs.add(new IntWritable(i % numSamples));
            hashTable.put(key, sampleIDs);

        // NOTE(review): the next three lines are block-comment text whose delimiters were lost.
         * Convert the Hastable to a MapWritable which we will
         * write to HDFS and distribute to all Mappers using
         * DistributedCache
        MapWritable map = new MapWritable();
        for (LongWritable key : hashTable.keySet()) {
            ArrayList<IntWritable> sampleIDs = hashTable.get(key);
            IntArrayWritable sampleIDsIAW = new IntArrayWritable();
            sampleIDsIAW.set(sampleIDs.toArray(new IntWritable[sampleIDs.size()]));
            map.put(key, sampleIDsIAW);

        // Create "samplesMap.ser" with overwrite enabled.
        // NOTE(review): the code that writes map to out and closes it is not visible here,
        // and the addCacheFile call below is truncated (its last argument is missing).
        fs = FileSystem.get(URI.create("samplesMap.ser"), conf);
        samplesMapPath = new Path("samplesMap.ser");
        FSDataOutputStream out = fs.create(samplesMapPath, true);
        DistributedCache.addCacheFile(new URI(fs.getWorkingDirectory() + "/samplesMap.ser#samplesMap.ser"),
        // stop the sampling timer
        long sampling_end_time = System.nanoTime();
        // nanoTime() differences are in nanoseconds; divide by 1,000,000 to get milliseconds.
        long sampling_runtime = (sampling_end_time - sampling_start_time) / 1000000;
        System.out.println("sampling runtime (milliseconds): " + sampling_runtime);
        break; // end switch case
    case 5:
        System.out.println("running random integer partition mapper...");
        Path inputFilePath = new Path(args[8]);
        WholeSplitInputFormat.addInputPath(conf, inputFilePath);
        // Compute number of map tasks.
        fs = inputFilePath.getFileSystem(conf);
        FileStatus inputFileStatus = fs.getFileStatus(inputFilePath);
        long len = inputFileStatus.getLen();
        long blockSize = inputFileStatus.getBlockSize();
        // Minimum and maximum split size are both pinned to the input file's block size.
        conf.setLong("mapred.min.split.size", blockSize);
        conf.setLong("mapred.max.split.size", blockSize);
        // Integer quotient of file length by block size, plus one.
        int mapTasksNum = ((int) (len / blockSize)) + 1;
        //System.out.println("len: " + len + " blockSize: "
        //      + blockSize + " mapTasksNum: " + mapTasksNum);
        // Extract random integer partition of total sample
        // size into up to mapTasksNum partitions.
        // XXX I'm not sure this is a correct way to do
        // it.
        rand = new Random();
        // toSampleArr[i][j] holds the size assigned to map task i for sample j.
        IntWritable[][] toSampleArr = new IntWritable[mapTasksNum][numSamples];
        for (int j = 0; j < numSamples; j++) {
            IntWritable[] tempToSampleArr = new IntWritable[mapTasksNum];
            int sum = 0;
            int i;
            for (i = 0; i < mapTasksNum - 1; i++) {
                // Draw a size in [0, sampleSize - sum) for task i and add it to the running sum.
                int size = rand.nextInt(sampleSize - sum);
                tempToSampleArr[i] = new IntWritable(size);
                sum += size;
                if (sum > numSamples * sampleSize) {
                    System.out.println("Something went wrong generating the sample Sizes");
                // NOTE(review): the body of this check (presumably a break) is not visible.
                if (sum == sampleSize) {
            // If the loop ran to completion, the last task receives the remainder;
            // otherwise the remaining tasks are assigned size 0.
            if (i == mapTasksNum - 1) {
                tempToSampleArr[i] = new IntWritable(sampleSize - sum);
            } else {
                for (; i < mapTasksNum; i++) {
                    tempToSampleArr[i] = new IntWritable(0);
            for (i = 0; i < mapTasksNum; i++) {
                toSampleArr[i][j] = tempToSampleArr[i];

        // Store each task's array in the job configuration under "PARMM.toSampleArr_<i>".
        for (int i = 0; i < mapTasksNum; i++) {
            DefaultStringifier.storeArray(conf, toSampleArr[i], "PARMM.toSampleArr_" + i);
        // NOTE(review): presumably the default branch of the switch; its label is not visible.
        System.err.println("Wrong Mapper ID. Can only be in [1,5]");

    // NOTE(review): the next three lines are block-comment text whose delimiters were lost.
     * We don't use the default hash partitioner because we want to
     * maximize the parallelism. That's why we also fix the number
     * of reducers.


    /************************ Job 2 (aggregation) Configuration ************************/

    JobConf confAggr = new JobConf(getConf());

    confAggr.setInt("PARMM.reducersNum", numSamples);
    confAggr.setInt("PARMM.reqApproxNum", reqApproxNum);
    confAggr.setInt("PARMM.sampleSize", sampleSize);
    confAggr.setFloat("PARMM.epsilon", epsilon);

    // XXX: Why do we disable speculative execution? MR
    confAggr.setBoolean("mapred.reduce.tasks.speculative.execution", false);
    confAggr.setInt("mapred.task.timeout", MR_TIMEOUT_MILLI);





    // Job 2 reads the directory that Job 1 writes (args[9]) and writes to args[10].
    SequenceFileInputFormat.addInputPath(confAggr, new Path(args[9]));

    FileOutputFormat.setOutputPath(confAggr, new Path(args[10]));

    // Run both jobs back to back, recording wall-clock timestamps around them.
    long FIMjob_start_time = System.currentTimeMillis();
    RunningJob FIMjob = JobClient.runJob(conf);
    long FIMjob_end_time = System.currentTimeMillis();

    RunningJob aggregateJob = JobClient.runJob(confAggr);
    long aggrJob_end_time = System.currentTimeMillis();

    long FIMjob_runtime = FIMjob_end_time - FIMjob_start_time;

    // The aggregation runtime is measured from the end of the FIM job.
    long aggrJob_runtime = aggrJob_end_time - FIMjob_end_time;

    // Only mapper option 4 creates the samplesMap file (see case 4 above).
    if (args[7].equals("4")) {
        // Remove samplesMap file
        fs.delete(samplesMapPath, false);

    // Copy the values of each counter group of the FIM job into an array.
    Counters counters = FIMjob.getCounters();
    Counters.Group FIMMapperStartTimesCounters = counters.getGroup("FIMMapperStart");
    long[] FIMMapperStartTimes = new long[FIMMapperStartTimesCounters.size()];
    int i = 0;
    for (Counters.Counter counter : FIMMapperStartTimesCounters) {
        FIMMapperStartTimes[i++] = counter.getCounter();

    Counters.Group FIMMapperEndTimesCounters = counters.getGroup("FIMMapperEnd");
    long[] FIMMapperEndTimes = new long[FIMMapperEndTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : FIMMapperEndTimesCounters) {
        FIMMapperEndTimes[i++] = counter.getCounter();

    Counters.Group FIMReducerStartTimesCounters = counters.getGroup("FIMReducerStart");
    long[] FIMReducerStartTimes = new long[FIMReducerStartTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : FIMReducerStartTimesCounters) {
        FIMReducerStartTimes[i++] = counter.getCounter();

    Counters.Group FIMReducerEndTimesCounters = counters.getGroup("FIMReducerEnd");
    long[] FIMReducerEndTimes = new long[FIMReducerEndTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : FIMReducerEndTimesCounters) {
        FIMReducerEndTimes[i++] = counter.getCounter();

    // Same extraction for the counter groups of the aggregation job.
    Counters countersAggr = aggregateJob.getCounters();
    Counters.Group AggregateMapperStartTimesCounters = countersAggr.getGroup("AggregateMapperStart");
    long[] AggregateMapperStartTimes = new long[AggregateMapperStartTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : AggregateMapperStartTimesCounters) {
        AggregateMapperStartTimes[i++] = counter.getCounter();

    Counters.Group AggregateMapperEndTimesCounters = countersAggr.getGroup("AggregateMapperEnd");
    long[] AggregateMapperEndTimes = new long[AggregateMapperEndTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : AggregateMapperEndTimesCounters) {
        AggregateMapperEndTimes[i++] = counter.getCounter();

    Counters.Group AggregateReducerStartTimesCounters = countersAggr.getGroup("AggregateReducerStart");
    long[] AggregateReducerStartTimes = new long[AggregateReducerStartTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : AggregateReducerStartTimesCounters) {
        AggregateReducerStartTimes[i++] = counter.getCounter();

    Counters.Group AggregateReducerEndTimesCounters = countersAggr.getGroup("AggregateReducerEnd");
    long[] AggregateReducerEndTimes = new long[AggregateReducerEndTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : AggregateReducerEndTimesCounters) {
        AggregateReducerEndTimes[i++] = counter.getCounter();

    // FIM mappers: earliest start, latest end, then per-task runtimes (sum, min, max).
    // NOTE(review): every [0] access below assumes the counter group is non-empty, and the
    // per-task differences assume start/end arrays have matching order and length - confirm.
    long FIMMapperStartMin = FIMMapperStartTimes[0];
    for (long l : FIMMapperStartTimes) {
        if (l < FIMMapperStartMin) {
            FIMMapperStartMin = l;
    long FIMMapperEndMax = FIMMapperEndTimes[0];
    for (long l : FIMMapperEndTimes) {
        if (l > FIMMapperEndMax) {
            FIMMapperEndMax = l;
    System.out.println("FIM job setup time (milliseconds): " + (FIMMapperStartMin - FIMjob_start_time));
    System.out.println("FIMMapper total runtime (milliseconds): " + (FIMMapperEndMax - FIMMapperStartMin));
    long[] FIMMapperRunTimes = new long[FIMMapperStartTimes.length];
    long FIMMapperRunTimesSum = 0;
    for (int l = 0; l < FIMMapperStartTimes.length; l++) {
        FIMMapperRunTimes[l] = FIMMapperEndTimes[l] - FIMMapperStartTimes[l];
        FIMMapperRunTimesSum += FIMMapperRunTimes[l];
    // The average is computed with integer (long) division.
    System.out.println("FIMMapper average task runtime (milliseconds): "
            + FIMMapperRunTimesSum / FIMMapperStartTimes.length);
    long FIMMapperRunTimesMin = FIMMapperRunTimes[0];
    long FIMMapperRunTimesMax = FIMMapperRunTimes[0];
    for (long l : FIMMapperRunTimes) {
        if (l < FIMMapperRunTimesMin) {
            FIMMapperRunTimesMin = l;
        if (l > FIMMapperRunTimesMax) {
            FIMMapperRunTimesMax = l;
    System.out.println("FIMMapper minimum task runtime (milliseconds): " + FIMMapperRunTimesMin);
    System.out.println("FIMMapper maximum task runtime (milliseconds): " + FIMMapperRunTimesMax);

    // FIM reducers: same statistics as for the mappers.
    long FIMReducerStartMin = FIMReducerStartTimes[0];
    for (long l : FIMReducerStartTimes) {
        if (l < FIMReducerStartMin) {
            FIMReducerStartMin = l;
    long FIMReducerEndMax = FIMReducerEndTimes[0];
    for (long l : FIMReducerEndTimes) {
        if (l > FIMReducerEndMax) {
            FIMReducerEndMax = l;
    // NOTE(review): the head of the next statement (presumably "System.out") is missing.
            .println("FIM job shuffle phase runtime (milliseconds): " + (FIMReducerStartMin - FIMMapperEndMax));
    System.out.println("FIMReducer total runtime (milliseconds): " + (FIMReducerEndMax - FIMReducerStartMin));
    long[] FIMReducerRunTimes = new long[FIMReducerStartTimes.length];
    long FIMReducerRunTimesSum = 0;
    for (int l = 0; l < FIMReducerStartTimes.length; l++) {
        FIMReducerRunTimes[l] = FIMReducerEndTimes[l] - FIMReducerStartTimes[l];
        FIMReducerRunTimesSum += FIMReducerRunTimes[l];
    System.out.println("FIMReducer average task runtime (milliseconds): "
            + FIMReducerRunTimesSum / FIMReducerStartTimes.length);
    long FIMReducerRunTimesMin = FIMReducerRunTimes[0];
    long FIMReducerRunTimesMax = FIMReducerRunTimes[0];
    for (long l : FIMReducerRunTimes) {
        if (l < FIMReducerRunTimesMin) {
            FIMReducerRunTimesMin = l;
        if (l > FIMReducerRunTimesMax) {
            FIMReducerRunTimesMax = l;
    System.out.println("FIMReducer minimum task runtime (milliseconds): " + FIMReducerRunTimesMin);
    System.out.println("FIMReducer maximum task runtime (milliseconds): " + FIMReducerRunTimesMax);
    System.out.println("FIM job cooldown time (milliseconds): " + (FIMjob_end_time - FIMReducerEndMax));

    // Aggregation mappers: same statistics.
    long AggregateMapperStartMin = AggregateMapperStartTimes[0];
    for (long l : AggregateMapperStartTimes) {
        if (l < AggregateMapperStartMin) {
            AggregateMapperStartMin = l;
    long AggregateMapperEndMax = AggregateMapperEndTimes[0];
    for (long l : AggregateMapperEndTimes) {
        if (l > AggregateMapperEndMax) {
            AggregateMapperEndMax = l;
    // NOTE(review): the head of the next statement (presumably "System.out.println(") is missing.
            "Aggregation job setup time (milliseconds): " + (AggregateMapperStartMin - FIMjob_end_time));
    System.out.println("AggregateMapper total runtime (milliseconds): "
            + (AggregateMapperEndMax - AggregateMapperStartMin));
    long[] AggregateMapperRunTimes = new long[AggregateMapperStartTimes.length];
    long AggregateMapperRunTimesSum = 0;
    for (int l = 0; l < AggregateMapperStartTimes.length; l++) {
        AggregateMapperRunTimes[l] = AggregateMapperEndTimes[l] - AggregateMapperStartTimes[l];
        AggregateMapperRunTimesSum += AggregateMapperRunTimes[l];
    System.out.println("AggregateMapper average task runtime (milliseconds): "
            + AggregateMapperRunTimesSum / AggregateMapperStartTimes.length);
    long AggregateMapperRunTimesMin = AggregateMapperRunTimes[0];
    long AggregateMapperRunTimesMax = AggregateMapperRunTimes[0];
    for (long l : AggregateMapperRunTimes) {
        if (l < AggregateMapperRunTimesMin) {
            AggregateMapperRunTimesMin = l;
        if (l > AggregateMapperRunTimesMax) {
            AggregateMapperRunTimesMax = l;
    System.out.println("AggregateMapper minimum task runtime (milliseconds): " + AggregateMapperRunTimesMin);
    System.out.println("AggregateMapper maximum task runtime (milliseconds): " + AggregateMapperRunTimesMax);

    // Aggregation reducers: same statistics.
    long AggregateReducerStartMin = AggregateReducerStartTimes[0];
    for (long l : AggregateReducerStartTimes) {
        if (l < AggregateReducerStartMin) {
            AggregateReducerStartMin = l;
    long AggregateReducerEndMax = AggregateReducerEndTimes[0];
    for (long l : AggregateReducerEndTimes) {
        if (l > AggregateReducerEndMax) {
            AggregateReducerEndMax = l;
    System.out.println("Aggregate job round shuffle phase runtime (milliseconds): "
            + (AggregateReducerStartMin - AggregateMapperEndMax));
    System.out.println("AggregateReducer total runtime (milliseconds): "
            + (AggregateReducerEndMax - AggregateReducerStartMin));
    long[] AggregateReducerRunTimes = new long[AggregateReducerStartTimes.length];
    long AggregateReducerRunTimesSum = 0;
    for (int l = 0; l < AggregateReducerStartTimes.length; l++) {
        AggregateReducerRunTimes[l] = AggregateReducerEndTimes[l] - AggregateReducerStartTimes[l];
        AggregateReducerRunTimesSum += AggregateReducerRunTimes[l];
    System.out.println("AggregateReducer average task runtime (milliseconds): "
            + AggregateReducerRunTimesSum / AggregateReducerStartTimes.length);
    long AggregateReducerRunTimesMin = AggregateReducerRunTimes[0];
    long AggregateReducerRunTimesMax = AggregateReducerRunTimes[0];
    for (long l : AggregateReducerRunTimes) {
        if (l < AggregateReducerRunTimesMin) {
            AggregateReducerRunTimesMin = l;
        if (l > AggregateReducerRunTimesMax) {
            AggregateReducerRunTimesMax = l;
    System.out.println("AggregateReducer minimum task runtime (milliseconds): " + AggregateReducerRunTimesMin);
    System.out.println("AggregateReducer maximum task runtime (milliseconds): " + AggregateReducerRunTimesMax);

    // Overall summary.
    // NOTE(review): three of the statements below have lost their heads (presumably
    // "System.out.println(" or "System.out").
            "Aggregation job cooldown time (milliseconds): " + (aggrJob_end_time - AggregateReducerEndMax));

            .println("total runtime (all inclusive) (milliseconds): " + (aggrJob_end_time - FIMjob_start_time));
    System.out.println("total runtime (no FIM job setup, no aggregation job cooldown) (milliseconds): "
            + (AggregateReducerEndMax - FIMMapperStartMin));
    System.out.println("total runtime (no setups, no cooldowns) (milliseconds): "
            + (FIMReducerEndMax - FIMMapperStartMin + AggregateReducerEndMax - AggregateMapperStartMin));
    System.out.println("FIM job runtime (including setup and cooldown) (milliseconds): " + FIMjob_runtime);
    System.out.println("FIM job runtime (no setup, no cooldown) (milliseconds): "
            + (FIMReducerEndMax - FIMMapperStartMin));
            "Aggregation job runtime (including setup and cooldown) (milliseconds): " + aggrJob_runtime);
    System.out.println("Aggregation job runtime (no setup, no cooldown) (milliseconds): "
            + (AggregateReducerEndMax - AggregateMapperStartMin));

    return 0;

From source file:BwaInterpreter.java

License:Open Source License

/**
 * Concatenates the given HDFS files into {@code <outputHdfsDir>/FullOutput.sam} and deletes
 * each input file afterwards.
 *
 * <p>NOTE(review): this listing is an incomplete excerpt. Closing braces, the close calls
 * and the body of the catch block are not present.
 *
 * @param outputHdfsDir  directory in which {@code FullOutput.sam} is created
 * @param returnedValues paths of the files to merge, processed in list order
 */
private void combineOutputSamFiles(String outputHdfsDir, List<String> returnedValues) {
    try {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Create the merged output file with overwrite enabled.
        Path finalHdfsOutputFile = new Path(outputHdfsDir + "/FullOutput.sam");
        FSDataOutputStream outputFinalStream = fs.create(finalHdfsOutputFile, true);

        // We iterate over the resulting files in HDFS and aggregate them into only one file.
        for (int i = 0; i < returnedValues.size(); i++) {
            LOG.info("JMAbuin:: SparkBWA :: Returned file ::" + returnedValues.get(i));
            BufferedReader br = new BufferedReader(
                    new InputStreamReader(fs.open(new Path(returnedValues.get(i)))));

            String line;
            line = br.readLine();

            while (line != null) {
                // Lines starting with "@" are copied only from the first file (i == 0);
                // all other lines are copied from every file.
                if (i == 0 || !line.startsWith("@")) {
                    // getBytes() without an argument uses the platform default charset.
                    outputFinalStream.write((line + "\n").getBytes());

                line = br.readLine();

            // Delete the input file that was just processed.
            fs.delete(new Path(returnedValues.get(i)), true);

    } catch (IOException e) {

From source file:BwaInterpreter.java

License:Open Source License

/**
 * Used to perform the sort operation in HDFS.
 *
 * @brief This function provides a method to perform the sort phase in HDFS
 * @author José M. Abuín
 * @param fileName1 The first file that contains input FASTQ reads. Stored in HDFS
 * @param fileName2 The second file that contains input FASTQ reads. Stored in HDFS
 * @return A JavaRDD that contains the paired reads sorted
 */
public JavaRDD<Tuple2<String, String>> SortInHDFS2(String fileName1, String fileName2) {
    // Interleaves the lines of the two input files into this.inputTmpFileName on HDFS, then
    // builds an RDD from that file. Returns null when an IOException is caught.
    // NOTE(review): this listing is an incomplete excerpt; closing braces, the close calls
    // and the heads of two statements are missing.

    Configuration conf = this.conf;

    LOG.info("JMAbuin:: Starting writing reads to HDFS");

    try {
        FileSystem fs = FileSystem.get(conf);

        Path outputFilePath = new Path(this.inputTmpFileName);

        //To write the paired reads (the file is created with overwrite enabled)
        FSDataOutputStream outputFinalStream = fs.create(outputFilePath, true);

        //To read paired reads from both files
        BufferedReader brFastqFile1 = new BufferedReader(new InputStreamReader(fs.open(new Path(fileName1))));
        BufferedReader brFastqFile2 = new BufferedReader(new InputStreamReader(fs.open(new Path(fileName2))));

        String lineFastq1;
        String lineFastq2;

        lineFastq1 = brFastqFile1.readLine();
        lineFastq2 = brFastqFile2.readLine();

        //Loop to read two files. Both must have the same number of lines
        // Only the first file's line is checked for end of input; if the second file is
        // shorter, the concatenation below writes the text "null" for its missing lines.
        while (lineFastq1 != null) {
            //The lines are written interleaved: one from file 1, then one from file 2
            outputFinalStream.write((lineFastq1 + "\n" + lineFastq2 + "\n").getBytes());

            //The next lines are read
            lineFastq1 = brFastqFile1.readLine();
            lineFastq2 = brFastqFile2.readLine();

        //Close the input and output files
        // NOTE(review): the close calls themselves are not visible in this excerpt.

        //Now it is time to read the previously created file and create the RDD
        ContentSummary cSummary = fs.getContentSummary(outputFilePath);

        long length = cSummary.getLength();

        this.totalInputLength = length;


        //In case the user does want partitioning
        if (this.options.getPartitionNumber() != 0) {

            //These options are set to indicate the split size and get the correct number of partitions
            // NOTE(review): the heads of the next two statements are missing; presumably they
            // set split-size properties to length / partitionNumber - confirm.
                    String.valueOf((length) / this.options.getPartitionNumber()));
                    String.valueOf((length) / this.options.getPartitionNumber()));

            LOG.info("JMAbuin partitioning from HDFS:: "
                    + String.valueOf((length) / this.options.getPartitionNumber()));

            //Using the FastqInputFormatDouble class we get values from the HDFS file. After that, these values are stored in a RDD
            return this.ctx.newAPIHadoopFile(this.inputTmpFileName, FastqInputFormatDouble.class, Long.class,
                    String.class, this.conf).mapPartitions(new BigFastq2RDDPartitionsDouble(), true);

        } else {
            //Using the FastqInputFormatDouble class we get values from the HDFS file. After that, these values are stored in a RDD
            return this.ctx.newAPIHadoopFile(this.inputTmpFileName, FastqInputFormatDouble.class, Long.class,
                    String.class, this.conf).map(new BigFastq2RDDDouble());

    } catch (IOException e) {
        // TODO Auto-generated catch block
        // As shown, the exception is neither logged nor rethrown; the caller just receives null.

        return null;

From source file:ClassifierHD.java

License:Apache License

/**
 * Loads a naive Bayes model and its dictionaries, classifies every file found in an HDFS
 * directory, binds the result to a PostgreSQL insert statement and writes an
 * "id TAB label" line per file to {@code <inputDir>/rep.list}.
 *
 * <p>NOTE(review): this listing is an incomplete excerpt. Closing braces and some statements
 * (for example the statement execution and the close calls) are not present.
 *
 * @param args {@code args[0]} model path, {@code args[1]} label index path, {@code args[2]}
 *             dictionary path, {@code args[3]} document frequency path, {@code args[4]}
 *             table name, {@code args[5]} HDFS input directory
 * @throws Exception declared by the signature
 */
public static void main(String[] args) throws Exception {
    // NOTE(review): the check requires only 5 arguments, but args[5] is read below; with
    // exactly 5 arguments that access throws ArrayIndexOutOfBoundsException.
    if (args.length < 5) {
        System.out.println(
                "Arguments: [model] [label index] [dictionnary] [document frequency] [postgres table] [hdfs dir] [job_id]");
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tablename = args[4];
    String inputDir = args[5];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);

    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map classId => label
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    // The entry stored under key -1 is used as the training-set document count.
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    Connection conn = null;
    PreparedStatement pstmt = null;

    try {
        // NOTE(review): the getConnection call below is truncated (its last argument is missing).
        conn = DriverManager.getConnection("jdbc:postgresql://", "postgres",
        // The table name comes from args[4] and is concatenated into the SQL text; the
        // column values are bound through "?" placeholders.
        String sql = "INSERT INTO " + tablename
                + " (id,gtime,wtime,target,num,link,body,rep) VALUES (?,?,?,?,?,?,?,?);";
        pstmt = conn.prepareStatement(sql);

        FileSystem fs = FileSystem.get(configuration);
        FileStatus[] status = fs.listStatus(new Path(inputDir));
        // Result list, created inside the input directory with overwrite enabled.
        BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(fs.create(new Path(inputDir + "/rep.list"), true)));

        for (int i = 0; i < status.length; i++) {
            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(status[i].getPath())));
            // NOTE(review): the body of this check is missing; presumably it skips the
            // rep.list output file itself - confirm.
            if (new String(status[i].getPath().getName()).equals("rep.list")) {
            int lv_HEAD = 1;
            int lv_cnt = 0;
            String lv_gtime = null;
            String lv_wtime = null;
            String lv_target = null;
            BigDecimal lv_num = null;
            String lv_link = null;
            String[] lv_args;
            String lv_line;
            StringBuilder lv_txt = new StringBuilder();
            // Lines read while lv_cnt < lv_HEAD are parsed as a comma-separated header
            // (gtime, wtime, target, num, link); later lines are appended to the body text.
            // NOTE(review): the increment of lv_cnt is not visible in this excerpt.
            while ((lv_line = br.readLine()) != null) {
                if (lv_cnt < lv_HEAD) {
                    lv_args = lv_line.split(",");
                    lv_gtime = lv_args[0];
                    lv_wtime = lv_args[1];
                    lv_target = lv_args[2];
                    lv_num = new BigDecimal(lv_args[3]);
                    lv_link = lv_args[4];
                } else {
                    lv_txt.append(lv_line + '\n');

            // The file name serves as the record id.
            String id = status[i].getPath().getName();
            String message = lv_txt.toString();

            Multiset<String> words = ConcurrentHashMultiset.create();

            // Tokenize the body text and look each term up in the dictionary.
            // NOTE(review): the statements that add known words to the multiset and count
            // them are not visible in this excerpt.
            TokenStream ts = analyzer.tokenStream("text", new StringReader(message));
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            int wordCount = 0;
            while (ts.incrementToken()) {
                if (termAtt.length() > 0) {
                    String word = ts.getAttribute(CharTermAttribute.class).toString();
                    Integer wordId = dictionary.get(word);
                    if (wordId != null) {


            // Build a sparse TF-IDF vector (cardinality 10000) indexed by word id.
            Vector vector = new RandomAccessSparseVector(10000);
            TFIDF tfidf = new TFIDF();
            for (Multiset.Entry<String> entry : words.entrySet()) {
                String word = entry.getElement();
                int count = entry.getCount();
                Integer wordId = dictionary.get(word);
                Long freq = documentFrequency.get(wordId);
                double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
                vector.setQuick(wordId, tfIdfValue);
            // Classify and keep the category with the highest score.
            Vector resultVector = classifier.classifyFull(vector);
            double bestScore = -Double.MAX_VALUE;
            int bestCategoryId = -1;
            for (Element element : resultVector.all()) {
                int categoryId = element.index();
                double score = element.get();
                if (score > bestScore) {
                    bestScore = score;
                    bestCategoryId = categoryId;
            //System.out.println(" => "+ lv_gtime + lv_wtime + lv_link + id + ":" + labels.get(bestCategoryId));
            pstmt.setString(1, id);
            pstmt.setString(2, lv_gtime);
            pstmt.setString(3, lv_wtime);
            pstmt.setString(4, lv_target);
            pstmt.setBigDecimal(5, lv_num);
            pstmt.setString(6, lv_link);
            // NOTE(review): the substring starts at index 1, so the first character is
            // dropped, and an empty message makes substring(1, 0) throw
            // StringIndexOutOfBoundsException.
            pstmt.setString(7, message.substring(1, Math.min(50, message.length())));
            pstmt.setString(8, labels.get(bestCategoryId));
            // NOTE(review): no execute/addBatch call is visible in this excerpt.
            bw.write(id + "\t" + labels.get(bestCategoryId) + "\n");
    } catch (Exception e) {
        // Only the exception class and message are reported.
        System.err.println(e.getClass().getName() + ": " + e.getMessage());

From source file:batch.BatchScan2Html.java

License:Apache License

public static void writeAccumuloTableToHdfsAsHtml() throws IOException, URISyntaxException {
    Configuration configuration = new Configuration();
    //TODO add options for URI and output Path
    FileSystem hdfs = FileSystem.get(new URI("hdfs://n001:54310"), configuration);
    Path file = new Path("hdfs://n001:54310/s2013/batch/table.html");
    //TODO add option to override file default: true
    if (hdfs.exists(file)) {
        hdfs.delete(file, true);//from   w  ww  .j a  va2  s.  c o m
    startTime = System.currentTimeMillis();
    OutputStream os = hdfs.create(file, new Progressable() {
        public void progress() {
            // TODO add a better progress descriptor
            crudeRunTime = System.currentTimeMillis() - startTime;
            out.println("...bytes written: [ " + bytesWritten + " ]");
            out.println("...bytes / second: [ " + (bytesWritten / crudeRunTime) * 1000 + " ]");
    BufferedWriter br = new BufferedWriter(new OutputStreamWriter(os, "UTF-8"));
    //  TODO add option for table id { example }
    writeHtmlTableHeader(br, "example", new ArrayList<String>(Arrays.asList("Row ID", "Column Family",
            "Column Qualifier", "Column Visibility", "Timestamp", "Value")));
    out.println("Total bytes written: " + bytesWritten);
    out.println("Total crude time: " + crudeRunTime / 1000);

From source file:be.ugent.intec.halvade.uploader.mapreduce.MyFastqOutputFormat.java

public RecordWriter<PairedIdWritable, FastqRecord> getRecordWriter(TaskAttemptContext task) throws IOException {
    Configuration conf = task.getConfiguration();
    boolean isCompressed = getCompressOutput(task);

    CompressionCodec codec = null;/* w  w w.j av  a  2  s.c o m*/
    String extension = "";

    if (isCompressed) {
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(task, GzipCodec.class);
        codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        extension = codec.getDefaultExtension();

    Path file = getDefaultWorkFile(task, extension);
    FileSystem fs = file.getFileSystem(conf);

    OutputStream output;

    if (isCompressed) {
        FSDataOutputStream fileOut = fs.create(file, false);
        output = new DataOutputStream(codec.createOutputStream(fileOut));
    } else {
        output = fs.create(file, false);

    return new FastqRecordWriter(conf, output);

From source file:br.com.lassal.nqueens.grid.job.GenerateSolutions.java

 * NQueens working folder structure /nqueens/board-{x}/partial/solution_X-4
 * @param queensSize/*from w  w  w .  j  a  va2  s.  c o m*/
 * @throws IOException
private void setWorkingFolder(int queensSize, Job job) throws IOException {
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    if (fs.isDirectory(new Path("/nqueens/board-" + queensSize + "/final"))) {
        System.exit(0); // ja foi processado anteriormente nao processa de novo

    String lastSolution = null;
    Path partialSolDir = new Path("/nqueens/board-" + queensSize + "/partial/");
    Path inputPath = null;
    Path outputPath = null;

    if (fs.exists(partialSolDir)) {
        RemoteIterator<LocatedFileStatus> dirsFound = fs.listLocatedStatus(partialSolDir);

        while (dirsFound.hasNext()) {
            LocatedFileStatus path = dirsFound.next();
            if (lastSolution == null) {
                lastSolution = path.getPath().getName();
                inputPath = path.getPath();
            } else {
                String currentDir = path.getPath().getName();
                if (lastSolution.compareToIgnoreCase(currentDir) < 0) {
                    lastSolution = currentDir;
                    inputPath = path.getPath();
    int currentSolutionSet = 0;
    if (inputPath == null) {
        inputPath = new Path("/nqueens/board-" + queensSize + "/seed");
        if (!fs.exists(inputPath)) {
            FSDataOutputStream seedFile = fs.create(inputPath, true);
            seedFile.writeBytes(queensSize + "#");
    // Input
    FileInputFormat.addInputPath(job, inputPath);

    if (lastSolution != null) {
        String[] solution = lastSolution.split("-");
        if (solution[0].equalsIgnoreCase("solution_" + queensSize)) {
            currentSolutionSet = Integer.parseInt(solution[1]) + 4;

            if (currentSolutionSet >= queensSize) {
                outputPath = new Path("/nqueens/board-" + queensSize + "/final");
            } else {
                outputPath = new Path("/nqueens/board-" + queensSize + "/partial/solution_" + queensSize + "-"
                        + currentSolutionSet);
    } else {
        outputPath = new Path("/nqueens/board-" + queensSize + "/partial/solution_" + queensSize + "-4");

    // Output
    FileOutputFormat.setOutputPath(job, outputPath);


From source file:br.com.lassal.nqueens.grid.job.NQueenCounter.java

private Path setWorkingFolder(int queensSize, String workingFolder, boolean isFinal, Job job)
        throws IOException {
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    Path returnPath = null;/*from   w w  w  . j a  va  2 s .c om*/

    if (workingFolder == null) {
        workingFolder = "";

    Path partialSolDir = new Path(workingFolder + "/nqueens/board-" + queensSize + "/partial/");
    Path inputPath = null;
    Path outputPath = null;
    String nextRunPath = "run_1";

    if (fs.exists(partialSolDir)) {
        RemoteIterator<LocatedFileStatus> dirsFound = fs.listLocatedStatus(partialSolDir);
        String lastRunPath = null;
        Path lastPath = null;

        while (dirsFound.hasNext()) {
            LocatedFileStatus dir = dirsFound.next();

            if (dir.isDirectory()) {
                if (lastRunPath == null || dir.getPath().getName().compareTo(lastRunPath) > 0) {
                    lastPath = dir.getPath();
                    lastRunPath = lastPath.getName();
        if (lastRunPath != null) {
            String[] runParts = lastRunPath.split("_");
            int lastRun = Integer.parseInt(runParts[1]);
            nextRunPath = runParts[0] + "_" + (++lastRun);
            inputPath = lastPath;

    if (inputPath == null) {
        inputPath = new Path(workingFolder + "/nqueens/board-" + queensSize + "/seed");
        if (!fs.exists(inputPath)) {
            FSDataOutputStream seedFile = fs.create(inputPath, true);
            seedFile.writeBytes(queensSize + ":");
    } else {
        returnPath = inputPath;
    // Input
    FileInputFormat.addInputPath(job, inputPath);

    if (isFinal) {
        outputPath = new Path(workingFolder + "/nqueens/board-" + queensSize + "/final");
    } else {
        outputPath = new Path(workingFolder + "/nqueens/board-" + queensSize + "/partial/" + nextRunPath);

    // Output
    FileOutputFormat.setOutputPath(job, outputPath);

    return returnPath;