List of usage examples for org.apache.hadoop.fs FileSystem mkdirs
public boolean mkdirs(Path f) throws IOException
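Before the project excerpts below, here is a minimal, self-contained sketch of the call (the paths and class name are illustrative, not from any of the listed projects). mkdirs creates the directory plus any missing parents and returns true on success; it also returns true if the directory already exists.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;

public class MkdirsExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Create a directory and all missing parent directories; check the boolean result.
        Path dir = new Path("/tmp/mkdirs-example/output"); // illustrative path
        if (!fs.mkdirs(dir)) {
            throw new IOException("Mkdirs failed to create " + dir);
        }

        // An overload also accepts an FsPermission to set the directory mode at creation time.
        fs.mkdirs(new Path("/tmp/mkdirs-example/restricted"), new FsPermission("700"));
    }
}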
From source file:ml.shifu.guagua.example.kmeans.KMeansDataOutput.java
License:Apache License
@Override
public void postApplication(WorkerContext<KMeansMasterParams, KMeansWorkerParams> context) {
    Path outFolder = new Path(context.getProps().getProperty(KMeansContants.KMEANS_DATA_OUTPUT,
            "part-g-" + context.getContainerId()));
    String separator = context.getProps().getProperty(KMeansContants.KMEANS_DATA_SEPERATOR);
    @SuppressWarnings("unchecked")
    MemoryDiskList<TaggedRecord> dataList = (MemoryDiskList<TaggedRecord>) context.getAttachment();
    PrintWriter pw = null;
    try {
        FileSystem fileSystem = FileSystem.get(new Configuration());
        fileSystem.mkdirs(outFolder);
        Path outputFile = new Path(outFolder, "part-g-" + context.getContainerId());
        FSDataOutputStream fos = fileSystem.create(outputFile);
        LOG.info("Writing results to {}", outputFile.toString());
        pw = new PrintWriter(fos);
        for (TaggedRecord record : dataList) {
            pw.println(record.toString(separator));
        }
        pw.flush();
    } catch (IOException e) {
        LOG.error("Error in writing output.", e);
    } finally {
        IOUtils.closeStream(pw);
    }
}
From source file:ml.shifu.guagua.mapreduce.example.kmeans.KMeansDataOutput.java
License:Apache License
@Override
public void postApplication(WorkerContext<KMeansMasterParams, KMeansWorkerParams> context) {
    Path outFolder = new Path(context.getProps().getProperty(KMeansContants.KMEANS_DATA_OUTPUT,
            "part-g-" + context.getContainerId()));
    String separator = context.getProps().getProperty(KMeansContants.KMEANS_DATA_SEPERATOR);
    @SuppressWarnings("unchecked")
    List<TaggedRecord> dataList = (List<TaggedRecord>) context.getAttachment();
    PrintWriter pw = null;
    try {
        FileSystem fileSystem = FileSystem.get(new Configuration());
        fileSystem.mkdirs(outFolder);
        Path outputFile = new Path(outFolder, "part-g-" + context.getContainerId());
        FSDataOutputStream fos = fileSystem.create(outputFile);
        LOG.info("Writing results to {}", outputFile.toString());
        pw = new PrintWriter(fos);
        for (TaggedRecord record : dataList) {
            pw.println(record.toString(separator));
        }
        pw.flush();
    } catch (IOException e) {
        LOG.error("Error in writing output.", e);
    } finally {
        IOUtils.closeStream(pw);
    }
}
From source file:ml.shifu.shifu.udf.ColumnProjector.java
License:Apache License
@Override
public void finish() {
    if (modelConfig.isClassification()) {
        return;
    }
    // Only for regression: in some cases like gbdt, the regression score is not in [0,1]. To do eval
    // performance, the max and min scores should be collected to set bounds.
    BufferedWriter writer = null;
    Configuration jobConf = UDFContext.getUDFContext().getJobConf();
    String scoreOutput = jobConf.get(Constants.SHIFU_EVAL_MAXMIN_SCORE_OUTPUT);
    log.debug("shifu.eval.maxmin.score.output is {}, job id is {}, task id is {}, attempt id is {}"
            + scoreOutput + " " + jobConf.get("mapreduce.job.id") + " " + jobConf.get("mapreduce.task.id")
            + " " + jobConf.get("mapreduce.task.partition") + " " + jobConf.get("mapreduce.task.attempt.id"));
    try {
        FileSystem fileSystem = FileSystem.get(jobConf);
        fileSystem.mkdirs(new Path(scoreOutput));
        String taskMaxMinScoreFile = scoreOutput + File.separator + "part-"
                + jobConf.get("mapreduce.task.attempt.id");
        writer = ShifuFileUtils.getWriter(taskMaxMinScoreFile, SourceType.HDFS);
        writer.write(maxScore + "," + minScore);
    } catch (IOException e) {
        log.error("error in finish", e);
    } finally {
        if (writer != null) {
            try {
                writer.close();
            } catch (IOException ignore) {
            }
        }
    }
}
From source file:ml.shifu.shifu.udf.EvalScoreUDF.java
License:Apache License
@Override
public void finish() {
    // Since the modelRunner is initialized during execution, it may not be initialized if there are no
    // records for this reducer, which would cause a NullPointerException.
    if (this.modelRunner != null) {
        this.modelRunner.close();
    }
    if (modelConfig.isClassification()) {
        return;
    }
    // Only for regression: in some cases like gbdt, the regression score is not in [0,1]. To do eval
    // performance, the max and min scores should be collected to set bounds.
    BufferedWriter writer = null;
    Configuration jobConf = UDFContext.getUDFContext().getJobConf();
    String scoreOutput = jobConf.get(Constants.SHIFU_EVAL_MAXMIN_SCORE_OUTPUT);
    log.debug("shifu.eval.maxmin.score.output is {}, job id is {}, task id is {}, attempt id is {}"
            + scoreOutput + " " + jobConf.get("mapreduce.job.id") + " " + jobConf.get("mapreduce.task.id")
            + " " + jobConf.get("mapreduce.task.partition") + " " + jobConf.get("mapreduce.task.attempt.id"));
    try {
        FileSystem fileSystem = FileSystem.get(jobConf);
        fileSystem.mkdirs(new Path(scoreOutput));
        String taskMaxMinScoreFile = scoreOutput + File.separator + "part-"
                + jobConf.get("mapreduce.task.attempt.id");
        writer = ShifuFileUtils.getWriter(taskMaxMinScoreFile, SourceType.HDFS);
        writer.write(maxScore + "," + minScore);
    } catch (IOException e) {
        log.error("error in finish", e);
    } finally {
        if (writer != null) {
            try {
                writer.close();
            } catch (IOException ignore) {
            }
        }
    }
}
From source file:ml.shifu.shifu.util.CommonUtils.java
License:Apache License
/**
 * Sync up all local configuration files to HDFS.
 *
 * @throws IOException If any exception on HDFS IO or local IO.
 * @throws NullPointerException If parameter {@code modelConfig} is null
 */
public static boolean copyConfFromLocalToHDFS(ModelConfig modelConfig) throws IOException {
    FileSystem hdfs = HDFSUtils.getFS();
    FileSystem localFs = HDFSUtils.getLocalFS();
    PathFinder pathFinder = new PathFinder(modelConfig);

    Path pathModelSet = new Path(pathFinder.getModelSetPath(SourceType.HDFS));
    // don't check whether pathModelSet exists; it should be removed by the user.
    hdfs.mkdirs(pathModelSet);

    // Copy ModelConfig
    Path srcModelConfig = new Path(pathFinder.getModelConfigPath(SourceType.LOCAL));
    Path dstModelConfig = new Path(pathFinder.getModelSetPath(SourceType.HDFS));
    hdfs.copyFromLocalFile(srcModelConfig, dstModelConfig);

    // Copy ColumnConfig
    Path srcColumnConfig = new Path(pathFinder.getColumnConfigPath(SourceType.LOCAL));
    Path dstColumnConfig = new Path(pathFinder.getColumnConfigPath(SourceType.HDFS));
    hdfs.copyFromLocalFile(srcColumnConfig, dstColumnConfig);

    // copy others
    Path srcVersion = new Path(pathFinder.getModelVersion(SourceType.LOCAL));
    if (localFs.exists(srcVersion)) {
        Path dstVersion = new Path(pathFinder.getModelVersion(SourceType.HDFS));
        hdfs.delete(dstVersion, true);
        hdfs.copyFromLocalFile(srcVersion, pathModelSet);
    }

    // Copy Models
    Path srcModels = new Path(pathFinder.getModelsPath(SourceType.LOCAL));
    if (localFs.exists(srcModels)) {
        Path dstModels = new Path(pathFinder.getModelsPath(SourceType.HDFS));
        hdfs.delete(dstModels, true);
        hdfs.copyFromLocalFile(srcModels, pathModelSet);
    }

    // Copy EvalSets
    Path evalsPath = new Path(pathFinder.getEvalsPath(SourceType.LOCAL));
    if (localFs.exists(evalsPath)) {
        for (FileStatus evalset : localFs.listStatus(evalsPath)) {
            EvalConfig evalConfig = modelConfig.getEvalConfigByName(evalset.getPath().getName());
            if (evalConfig != null) {
                copyEvalDataFromLocalToHDFS(modelConfig, evalConfig.getName());
            }
        }
    }

    return true;
}
From source file:msc.fall2015.stock.kmeans.hbase.mapreduce.pwd.PairWiseAlignment.java
License:Open Source License
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: <sequence_file> <sequence_count> <block_size> <weight>");
        System.exit(2);
    }

    /* input parameters */
    String sequenceFile = args[1];
    System.out.println(sequenceFile);
    // we are limited to int's as java loops support only them
    int noOfSequences = Integer.parseInt(args[2]);
    // int noOfSequences = 7322;
    int blockSize = Integer.parseInt(args[3]);
    boolean weightCalculate = Boolean.parseBoolean(args[4]);
    // int blockSize = 7322;

    Configuration conf = new Configuration();
    Job job = new Job(conf, "Pairwise-analysis");

    /* create the base dir for this job. Delete and recreate if it exists */
    Path hdMainDir = new Path(msc.fall2015.stock.kmeans.utils.Constants.HDFS_HOME_PATH + "swg-hadoop");
    FileSystem fs = FileSystem.get(conf);
    fs.delete(hdMainDir, true);

    Path hdInputDir = new Path(hdMainDir, "data");
    if (!fs.mkdirs(hdInputDir)) {
        throw new IOException("Mkdirs failed to create" + "/swg-hadoop/data");
    }

    int noOfDivisions = (int) Math.ceil(noOfSequences / (double) blockSize);
    int noOfBlocks = (noOfDivisions * (noOfDivisions + 1)) / 2;
    System.out.println("No of divisions :" + noOfDivisions + "\nNo of blocks :" + noOfBlocks
            + "\nBlock size :" + blockSize);

    // Retrieving the configuration from the job to set the properties
    // Setting properties to the original conf does not work (possible Hadoop bug)
    Configuration jobConf = job.getConfiguration();

    // Input dir in HDFS. Create this in the newly created job base dir
    Path inputDir = new Path(hdMainDir, "input");
    if (!fs.mkdirs(inputDir)) {
        throw new IOException("Mkdirs failed to create " + inputDir.toString());
    }

    Long dataPartitionStartTime = System.nanoTime();
    partitionData(sequenceFile, noOfSequences, blockSize, fs, noOfDivisions, jobConf, inputDir);
    distributeData(blockSize, conf, fs, hdInputDir, noOfDivisions);
    long dataPartTime = (System.nanoTime() - dataPartitionStartTime) / 1000000;
    System.out.println("Data Partition & Scatter Completed in (ms):" + dataPartTime);

    // Output dir in HDFS
    Path hdOutDir = new Path(hdMainDir, "out");

    jobConf.setInt(Constants.BLOCK_SIZE, blockSize);
    jobConf.setInt(Constants.NO_OF_DIVISIONS, noOfDivisions);
    jobConf.setInt(Constants.NO_OF_SEQUENCES, noOfSequences);
    jobConf.setBoolean(Constants.WEIGHT_ENABLED, weightCalculate);

    job.setJarByClass(PairWiseAlignment.class);
    job.setMapperClass(SWGMap.class);
    job.setReducerClass(SWGReduce.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(SWGWritable.class);
    FileInputFormat.setInputPaths(job, hdInputDir);
    FileOutputFormat.setOutputPath(job, hdOutDir);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks((int) noOfDivisions);

    long startTime = System.currentTimeMillis();
    int exitStatus = job.waitForCompletion(true) ? 0 : 1;
    double executionTime = (System.currentTimeMillis() - startTime) / 1000.0;
    System.out.println("Job Finished in " + executionTime + " seconds");

    if (args.length == 5) {
        FileWriter writer = new FileWriter(args[4]);
        writer.write("# #seq\t#blockS\tTtime\tinput\tdataDistTime\toutput");
        writer.write("\n");
        writer.write(noOfSequences + "\t" + noOfBlocks + "\t" + executionTime + "\t" + sequenceFile + "\t"
                + dataPartTime + "\t" + hdMainDir);
        writer.write("\n");
        writer.flush();
        writer.close();
    }
    return exitStatus;
}
From source file:mvm.rya.accumulo.mr.fileinput.BulkNtripsInputTool.java
License:Apache License
@Override
public int run(final String[] args) throws Exception {
    final Configuration conf = getConf();
    try {
        // conf
        zk = conf.get(MRUtils.AC_ZK_PROP, zk);
        ttl = conf.get(MRUtils.AC_TTL_PROP, ttl);
        instance = conf.get(MRUtils.AC_INSTANCE_PROP, instance);
        userName = conf.get(MRUtils.AC_USERNAME_PROP, userName);
        pwd = conf.get(MRUtils.AC_PWD_PROP, pwd);
        workDirBase = conf.get(WORKDIR_PROP, workDirBase);
        format = conf.get(MRUtils.FORMAT_PROP, format);
        conf.set(MRUtils.FORMAT_PROP, format);
        final String inputDir = args[0];

        ZooKeeperInstance zooKeeperInstance = new ZooKeeperInstance(instance, zk);
        Connector connector = zooKeeperInstance.getConnector(userName, new PasswordToken(pwd));
        TableOperations tableOperations = connector.tableOperations();

        if (conf.get(AccumuloRdfConfiguration.CONF_ADDITIONAL_INDEXERS) != null) {
            throw new IllegalArgumentException("Cannot use Bulk N Trips tool with Additional Indexers");
        }

        String tablePrefix = conf.get(MRUtils.TABLE_PREFIX_PROPERTY, null);
        if (tablePrefix != null)
            RdfCloudTripleStoreConstants.prefixTables(tablePrefix);
        String[] tables = { tablePrefix + RdfCloudTripleStoreConstants.TBL_OSP_SUFFIX,
                tablePrefix + RdfCloudTripleStoreConstants.TBL_SPO_SUFFIX,
                tablePrefix + RdfCloudTripleStoreConstants.TBL_PO_SUFFIX };

        Collection<Job> jobs = new ArrayList<Job>();
        for (final String tableName : tables) {
            PrintStream out = null;
            try {
                String workDir = workDirBase + "/" + tableName;
                System.out.println("Loading data into table[" + tableName + "]");

                Job job = new Job(new Configuration(conf),
                        "Bulk Ingest load data to Generic RDF Table[" + tableName + "]");
                job.setJarByClass(this.getClass());
                // setting long job
                Configuration jobConf = job.getConfiguration();
                jobConf.setBoolean("mapred.map.tasks.speculative.execution", false);
                jobConf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
                jobConf.set("io.sort.mb", jobConf.get("io.sort.mb", "256"));
                jobConf.setBoolean("mapred.compress.map.output", true);
                // jobConf.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec"); // TODO: I would like LZO compression

                job.setInputFormatClass(TextInputFormat.class);

                job.setMapperClass(ParseNtripsMapper.class);
                job.setMapOutputKeyClass(Key.class);
                job.setMapOutputValueClass(Value.class);

                job.setCombinerClass(OutStmtMutationsReducer.class);
                job.setReducerClass(OutStmtMutationsReducer.class);
                job.setOutputFormatClass(AccumuloFileOutputFormat.class);
                // AccumuloFileOutputFormat.setZooKeeperInstance(jobConf, instance, zk);

                jobConf.set(ParseNtripsMapper.TABLE_PROPERTY, tableName);

                TextInputFormat.setInputPaths(job, new Path(inputDir));

                FileSystem fs = FileSystem.get(conf);
                Path workPath = new Path(workDir);
                if (fs.exists(workPath))
                    fs.delete(workPath, true);

                // make failures dir
                Path failures = new Path(workDir, "failures");
                fs.delete(failures, true);
                fs.mkdirs(new Path(workDir, "failures"));

                AccumuloFileOutputFormat.setOutputPath(job, new Path(workDir + "/files"));

                out = new PrintStream(new BufferedOutputStream(fs.create(new Path(workDir + "/splits.txt"))));

                if (!tableOperations.exists(tableName))
                    tableOperations.create(tableName);
                Collection<Text> splits = tableOperations.getSplits(tableName, Integer.MAX_VALUE);
                for (Text split : splits)
                    out.println(new String(Base64.encodeBase64(TextUtil.getBytes(split))));

                job.setNumReduceTasks(splits.size() + 1);
                out.close();

                job.setPartitionerClass(KeyRangePartitioner.class);
                RangePartitioner.setSplitFile(job, workDir + "/splits.txt");

                jobConf.set(WORKDIR_PROP, workDir);

                job.submit();
                jobs.add(job);
            } catch (Exception re) {
                throw new RuntimeException(re);
            } finally {
                if (out != null)
                    out.close();
            }
        }

        for (Job job : jobs) {
            while (!job.isComplete()) {
                Thread.sleep(1000);
            }
        }

        for (String tableName : tables) {
            String workDir = workDirBase + "/" + tableName;
            String filesDir = workDir + "/files";
            String failuresDir = workDir + "/failures";
            FileSystem fs = FileSystem.get(conf);
            // make sure that the "accumulo" user can read/write/execute into these directories
            fs.setPermission(new Path(filesDir), new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL));
            fs.setPermission(new Path(failuresDir), new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL));

            tableOperations.importDirectory(tableName, filesDir, failuresDir, false);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
    return 0;
}
From source file:net.peacesoft.nutch.crawl.ReCrawl.java
License:Apache License
public Path getPathUrls(Path dir, List<String> seeds, String urls) throws Exception {
    String tmpSeedDir = urls + "/seed-" + System.currentTimeMillis();
    FileSystem fs = FileSystem.get(getConf());
    Path p = new Path(dir, tmpSeedDir);
    fs.mkdirs(p);
    Path seedOut = new Path(p, urls);
    OutputStream os = fs.create(seedOut);
    for (String s : seeds) {
        os.write(s.getBytes());
        os.write('\n');
    }
    os.flush();
    os.close();
    return p;
}
From source file:net.peacesoft.nutch.crawl.ReCrawlDb.java
License:Apache License
public static void install(JobConf job, Path crawlDb) throws IOException {
    boolean preserveBackup = job.getBoolean("db.preserve.backup", true);
    Path newCrawlDb = FileOutputFormat.getOutputPath(job);
    FileSystem fs = new JobClient(job).getFs();
    Path old = new Path(crawlDb, "old");
    Path current = new Path(crawlDb, CURRENT_NAME);
    if (fs.exists(current)) {
        if (fs.exists(old)) {
            fs.delete(old, true);
        }
        fs.rename(current, old);
    }
    fs.mkdirs(crawlDb);
    fs.rename(newCrawlDb, current);
    if (!preserveBackup && fs.exists(old)) {
        fs.delete(old, true);
    }
    Path lock = new Path(crawlDb, LOCK_NAME);
    LockUtil.removeLockFile(fs, lock);
}
From source file:net.peacesoft.nutch.crawl.ReLinkDb.java
License:Apache License
public static void install(JobConf job, Path linkDb) throws IOException {
    Path newLinkDb = FileOutputFormat.getOutputPath(job);
    FileSystem fs = new JobClient(job).getFs();
    Path old = new Path(linkDb, "old");
    Path current = new Path(linkDb, CURRENT_NAME);
    if (fs.exists(current)) {
        if (fs.exists(old)) {
            fs.delete(old, true);
        }
        fs.rename(current, old);
    }
    fs.mkdirs(linkDb);
    fs.rename(newLinkDb, current);
    if (fs.exists(old)) {
        fs.delete(old, true);
    }
    LockUtil.removeLockFile(fs, new Path(linkDb, LOCK_NAME));
}