List of usage examples for org.apache.hadoop.fs FileSystem mkdirs
public boolean mkdirs(Path f) throws IOException
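Before the project excerpts below, here is a minimal, self-contained sketch of the call (the paths and class name are illustrative, not from any of the listed projects). mkdirs creates the directory plus any missing parents and returns true on success; it also returns true if the directory already exists.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;

public class MkdirsExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Create a directory and all missing parent directories; check the boolean result.
        Path dir = new Path("/tmp/mkdirs-example/output"); // illustrative path
        if (!fs.mkdirs(dir)) {
            throw new IOException("Mkdirs failed to create " + dir);
        }

        // An overload also accepts an FsPermission to set the directory mode at creation time.
        fs.mkdirs(new Path("/tmp/mkdirs-example/restricted"), new FsPermission("700"));
    }
}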
From source file:ml.shifu.guagua.example.kmeans.KMeansDataOutput.java
License:Apache License
@Override
public void postApplication(WorkerContext<KMeansMasterParams, KMeansWorkerParams> context) {
    Path outFolder = new Path(context.getProps().getProperty(KMeansContants.KMEANS_DATA_OUTPUT,
            "part-g-" + context.getContainerId()));
    String separator = context.getProps().getProperty(KMeansContants.KMEANS_DATA_SEPERATOR);
    @SuppressWarnings("unchecked")
    MemoryDiskList<TaggedRecord> dataList = (MemoryDiskList<TaggedRecord>) context.getAttachment();
    PrintWriter pw = null;
    try {
        FileSystem fileSystem = FileSystem.get(new Configuration());
        fileSystem.mkdirs(outFolder);
        Path outputFile = new Path(outFolder, "part-g-" + context.getContainerId());
        FSDataOutputStream fos = fileSystem.create(outputFile);
        LOG.info("Writing results to {}", outputFile.toString());
        pw = new PrintWriter(fos);
        for (TaggedRecord record : dataList) {
            pw.println(record.toString(separator));
        }
        pw.flush();
    } catch (IOException e) {
        LOG.error("Error in writing output.", e);
    } finally {
        IOUtils.closeStream(pw);
    }
}
From source file:ml.shifu.guagua.mapreduce.example.kmeans.KMeansDataOutput.java
License:Apache License
@Override
public void postApplication(WorkerContext<KMeansMasterParams, KMeansWorkerParams> context) {
    Path outFolder = new Path(context.getProps().getProperty(KMeansContants.KMEANS_DATA_OUTPUT,
            "part-g-" + context.getContainerId()));
    String separator = context.getProps().getProperty(KMeansContants.KMEANS_DATA_SEPERATOR);
    @SuppressWarnings("unchecked")
    List<TaggedRecord> dataList = (List<TaggedRecord>) context.getAttachment();
    PrintWriter pw = null;
    try {
        FileSystem fileSystem = FileSystem.get(new Configuration());
        fileSystem.mkdirs(outFolder);
        Path outputFile = new Path(outFolder, "part-g-" + context.getContainerId());
        FSDataOutputStream fos = fileSystem.create(outputFile);
        LOG.info("Writing results to {}", outputFile.toString());
        pw = new PrintWriter(fos);
        for (TaggedRecord record : dataList) {
            pw.println(record.toString(separator));
        }
        pw.flush();
    } catch (IOException e) {
        LOG.error("Error in writing output.", e);
    } finally {
        IOUtils.closeStream(pw);
    }
}
From source file:ml.shifu.shifu.udf.ColumnProjector.java
License:Apache License
@Override
public void finish() {
    if (modelConfig.isClassification()) {
        return;
    }
    // Only for regression: in some cases like gbdt, the regression score is not in [0,1]. To do eval
    // performance, the max and min scores should be collected to set bounds.
    BufferedWriter writer = null;
    Configuration jobConf = UDFContext.getUDFContext().getJobConf();
    String scoreOutput = jobConf.get(Constants.SHIFU_EVAL_MAXMIN_SCORE_OUTPUT);
    log.debug("shifu.eval.maxmin.score.output is {}, job id is {}, task id is {}, attempt id is {}"
            + scoreOutput + " " + jobConf.get("mapreduce.job.id") + " " + jobConf.get("mapreduce.task.id")
            + " " + jobConf.get("mapreduce.task.partition") + " " + jobConf.get("mapreduce.task.attempt.id"));
    try {
        FileSystem fileSystem = FileSystem.get(jobConf);
        fileSystem.mkdirs(new Path(scoreOutput));
        String taskMaxMinScoreFile = scoreOutput + File.separator + "part-"
                + jobConf.get("mapreduce.task.attempt.id");
        writer = ShifuFileUtils.getWriter(taskMaxMinScoreFile, SourceType.HDFS);
        writer.write(maxScore + "," + minScore);
    } catch (IOException e) {
        log.error("error in finish", e);
    } finally {
        if (writer != null) {
            try {
                writer.close();
            } catch (IOException ignore) {
            }
        }
    }
}
From source file:ml.shifu.shifu.udf.EvalScoreUDF.java
License:Apache License
@Override
public void finish() {
    // Since the modelRunner is initialized during execution, it may not be initialized if there are no
    // records for this reducer, which would cause a NullPointerException.
    if (this.modelRunner != null) {
        this.modelRunner.close();
    }
    if (modelConfig.isClassification()) {
        return;
    }
    // Only for regression: in some cases like gbdt, the regression score is not in [0,1]. To do eval
    // performance, the max and min scores should be collected to set bounds.
    BufferedWriter writer = null;
    Configuration jobConf = UDFContext.getUDFContext().getJobConf();
    String scoreOutput = jobConf.get(Constants.SHIFU_EVAL_MAXMIN_SCORE_OUTPUT);
    log.debug("shifu.eval.maxmin.score.output is {}, job id is {}, task id is {}, attempt id is {}"
            + scoreOutput + " " + jobConf.get("mapreduce.job.id") + " " + jobConf.get("mapreduce.task.id")
            + " " + jobConf.get("mapreduce.task.partition") + " " + jobConf.get("mapreduce.task.attempt.id"));
    try {
        FileSystem fileSystem = FileSystem.get(jobConf);
        fileSystem.mkdirs(new Path(scoreOutput));
        String taskMaxMinScoreFile = scoreOutput + File.separator + "part-"
                + jobConf.get("mapreduce.task.attempt.id");
        writer = ShifuFileUtils.getWriter(taskMaxMinScoreFile, SourceType.HDFS);
        writer.write(maxScore + "," + minScore);
    } catch (IOException e) {
        log.error("error in finish", e);
    } finally {
        if (writer != null) {
            try {
                writer.close();
            } catch (IOException ignore) {
            }
        }
    }
}
From source file:ml.shifu.shifu.util.CommonUtils.java
License:Apache License
/**
 * Sync up all local configuration files to HDFS.
 *
 * @throws IOException If any exception on HDFS IO or local IO.
 * @throws NullPointerException If parameter {@code modelConfig} is null
 */
public static boolean copyConfFromLocalToHDFS(ModelConfig modelConfig) throws IOException {
    FileSystem hdfs = HDFSUtils.getFS();
    FileSystem localFs = HDFSUtils.getLocalFS();
    PathFinder pathFinder = new PathFinder(modelConfig);

    Path pathModelSet = new Path(pathFinder.getModelSetPath(SourceType.HDFS));
    // don't check whether pathModelSet exists; it should be removed by the user.
    hdfs.mkdirs(pathModelSet);

    // Copy ModelConfig
    Path srcModelConfig = new Path(pathFinder.getModelConfigPath(SourceType.LOCAL));
    Path dstModelConfig = new Path(pathFinder.getModelSetPath(SourceType.HDFS));
    hdfs.copyFromLocalFile(srcModelConfig, dstModelConfig);

    // Copy ColumnConfig
    Path srcColumnConfig = new Path(pathFinder.getColumnConfigPath(SourceType.LOCAL));
    Path dstColumnConfig = new Path(pathFinder.getColumnConfigPath(SourceType.HDFS));
    hdfs.copyFromLocalFile(srcColumnConfig, dstColumnConfig);

    // copy others
    Path srcVersion = new Path(pathFinder.getModelVersion(SourceType.LOCAL));
    if (localFs.exists(srcVersion)) {
        Path dstVersion = new Path(pathFinder.getModelVersion(SourceType.HDFS));
        hdfs.delete(dstVersion, true);
        hdfs.copyFromLocalFile(srcVersion, pathModelSet);
    }

    // Copy Models
    Path srcModels = new Path(pathFinder.getModelsPath(SourceType.LOCAL));
    if (localFs.exists(srcModels)) {
        Path dstModels = new Path(pathFinder.getModelsPath(SourceType.HDFS));
        hdfs.delete(dstModels, true);
        hdfs.copyFromLocalFile(srcModels, pathModelSet);
    }

    // Copy EvalSets
    Path evalsPath = new Path(pathFinder.getEvalsPath(SourceType.LOCAL));
    if (localFs.exists(evalsPath)) {
        for (FileStatus evalset : localFs.listStatus(evalsPath)) {
            EvalConfig evalConfig = modelConfig.getEvalConfigByName(evalset.getPath().getName());
            if (evalConfig != null) {
                copyEvalDataFromLocalToHDFS(modelConfig, evalConfig.getName());
            }
        }
    }

    return true;
}
From source file:msc.fall2015.stock.kmeans.hbase.mapreduce.pwd.PairWiseAlignment.java
License:Open Source License
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: <sequence_file> <sequence_count> <block_size> <weight>");
        System.exit(2);
    }

    /* input parameters */
    String sequenceFile = args[1];
    System.out.println(sequenceFile);
    // we are limited to int's as java loops support only them
    int noOfSequences = Integer.parseInt(args[2]);
    // int noOfSequences = 7322;
    int blockSize = Integer.parseInt(args[3]);
    boolean weightCalculate = Boolean.parseBoolean(args[4]);
    // int blockSize = 7322;

    Configuration conf = new Configuration();
    Job job = new Job(conf, "Pairwise-analysis");

    /* create the base dir for this job. Delete and recreate if it exists */
    Path hdMainDir = new Path(msc.fall2015.stock.kmeans.utils.Constants.HDFS_HOME_PATH + "swg-hadoop");
    FileSystem fs = FileSystem.get(conf);
    fs.delete(hdMainDir, true);

    Path hdInputDir = new Path(hdMainDir, "data");
    if (!fs.mkdirs(hdInputDir)) {
        throw new IOException("Mkdirs failed to create" + "/swg-hadoop/data");
    }

    int noOfDivisions = (int) Math.ceil(noOfSequences / (double) blockSize);
    int noOfBlocks = (noOfDivisions * (noOfDivisions + 1)) / 2;
    System.out.println("No of divisions :" + noOfDivisions + "\nNo of blocks :" + noOfBlocks
            + "\nBlock size :" + blockSize);

    // Retrieving the configuration from the job to set the properties
    // Setting properties to the original conf does not work (possible Hadoop bug)
    Configuration jobConf = job.getConfiguration();

    // Input dir in HDFS. Create this in the newly created job base dir
    Path inputDir = new Path(hdMainDir, "input");
    if (!fs.mkdirs(inputDir)) {
        throw new IOException("Mkdirs failed to create " + inputDir.toString());
    }

    Long dataPartitionStartTime = System.nanoTime();
    partitionData(sequenceFile, noOfSequences, blockSize, fs, noOfDivisions, jobConf, inputDir);
    distributeData(blockSize, conf, fs, hdInputDir, noOfDivisions);
    long dataPartTime = (System.nanoTime() - dataPartitionStartTime) / 1000000;
    System.out.println("Data Partition & Scatter Completed in (ms):" + dataPartTime);

    // Output dir in HDFS
    Path hdOutDir = new Path(hdMainDir, "out");

    jobConf.setInt(Constants.BLOCK_SIZE, blockSize);
    jobConf.setInt(Constants.NO_OF_DIVISIONS, noOfDivisions);
    jobConf.setInt(Constants.NO_OF_SEQUENCES, noOfSequences);
    jobConf.setBoolean(Constants.WEIGHT_ENABLED, weightCalculate);

    job.setJarByClass(PairWiseAlignment.class);
    job.setMapperClass(SWGMap.class);
    job.setReducerClass(SWGReduce.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(SWGWritable.class);
    FileInputFormat.setInputPaths(job, hdInputDir);
    FileOutputFormat.setOutputPath(job, hdOutDir);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks((int) noOfDivisions);

    long startTime = System.currentTimeMillis();
    int exitStatus = job.waitForCompletion(true) ? 0 : 1;
    double executionTime = (System.currentTimeMillis() - startTime) / 1000.0;
    System.out.println("Job Finished in " + executionTime + " seconds");

    if (args.length == 5) {
        FileWriter writer = new FileWriter(args[4]);
        writer.write("# #seq\t#blockS\tTtime\tinput\tdataDistTime\toutput");
        writer.write("\n");
        writer.write(noOfSequences + "\t" + noOfBlocks + "\t" + executionTime + "\t" + sequenceFile + "\t"
                + dataPartTime + "\t" + hdMainDir);
        writer.write("\n");
        writer.flush();
        writer.close();
    }
    return exitStatus;
}
From source file:mvm.rya.accumulo.mr.fileinput.BulkNtripsInputTool.java
License:Apache License
@Override
public int run(final String[] args) throws Exception {
    final Configuration conf = getConf();
    try {
        // conf
        zk = conf.get(MRUtils.AC_ZK_PROP, zk);
        ttl = conf.get(MRUtils.AC_TTL_PROP, ttl);
        instance = conf.get(MRUtils.AC_INSTANCE_PROP, instance);
        userName = conf.get(MRUtils.AC_USERNAME_PROP, userName);
        pwd = conf.get(MRUtils.AC_PWD_PROP, pwd);
        workDirBase = conf.get(WORKDIR_PROP, workDirBase);
        format = conf.get(MRUtils.FORMAT_PROP, format);
        conf.set(MRUtils.FORMAT_PROP, format);
        final String inputDir = args[0];

        ZooKeeperInstance zooKeeperInstance = new ZooKeeperInstance(instance, zk);
        Connector connector = zooKeeperInstance.getConnector(userName, new PasswordToken(pwd));
        TableOperations tableOperations = connector.tableOperations();

        if (conf.get(AccumuloRdfConfiguration.CONF_ADDITIONAL_INDEXERS) != null) {
            throw new IllegalArgumentException("Cannot use Bulk N Trips tool with Additional Indexers");
        }

        String tablePrefix = conf.get(MRUtils.TABLE_PREFIX_PROPERTY, null);
        if (tablePrefix != null)
            RdfCloudTripleStoreConstants.prefixTables(tablePrefix);
        String[] tables = { tablePrefix + RdfCloudTripleStoreConstants.TBL_OSP_SUFFIX,
                tablePrefix + RdfCloudTripleStoreConstants.TBL_SPO_SUFFIX,
                tablePrefix + RdfCloudTripleStoreConstants.TBL_PO_SUFFIX };

        Collection<Job> jobs = new ArrayList<Job>();
        for (final String tableName : tables) {
            PrintStream out = null;
            try {
                String workDir = workDirBase + "/" + tableName;
                System.out.println("Loading data into table[" + tableName + "]");

                Job job = new Job(new Configuration(conf),
                        "Bulk Ingest load data to Generic RDF Table[" + tableName + "]");
                job.setJarByClass(this.getClass());
                // setting long job
                Configuration jobConf = job.getConfiguration();
                jobConf.setBoolean("mapred.map.tasks.speculative.execution", false);
                jobConf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
                jobConf.set("io.sort.mb", jobConf.get("io.sort.mb", "256"));
                jobConf.setBoolean("mapred.compress.map.output", true);
                // jobConf.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec"); // TODO: I would like LZO compression

                job.setInputFormatClass(TextInputFormat.class);

                job.setMapperClass(ParseNtripsMapper.class);
                job.setMapOutputKeyClass(Key.class);
                job.setMapOutputValueClass(Value.class);

                job.setCombinerClass(OutStmtMutationsReducer.class);
                job.setReducerClass(OutStmtMutationsReducer.class);
                job.setOutputFormatClass(AccumuloFileOutputFormat.class);
                // AccumuloFileOutputFormat.setZooKeeperInstance(jobConf, instance, zk);

                jobConf.set(ParseNtripsMapper.TABLE_PROPERTY, tableName);

                TextInputFormat.setInputPaths(job, new Path(inputDir));

                FileSystem fs = FileSystem.get(conf);
                Path workPath = new Path(workDir);
                if (fs.exists(workPath))
                    fs.delete(workPath, true);

                // make failures dir
                Path failures = new Path(workDir, "failures");
                fs.delete(failures, true);
                fs.mkdirs(new Path(workDir, "failures"));

                AccumuloFileOutputFormat.setOutputPath(job, new Path(workDir + "/files"));

                out = new PrintStream(new BufferedOutputStream(fs.create(new Path(workDir + "/splits.txt"))));

                if (!tableOperations.exists(tableName))
                    tableOperations.create(tableName);
                Collection<Text> splits = tableOperations.getSplits(tableName, Integer.MAX_VALUE);
                for (Text split : splits)
                    out.println(new String(Base64.encodeBase64(TextUtil.getBytes(split))));

                job.setNumReduceTasks(splits.size() + 1);
                out.close();

                job.setPartitionerClass(KeyRangePartitioner.class);
                RangePartitioner.setSplitFile(job, workDir + "/splits.txt");

                jobConf.set(WORKDIR_PROP, workDir);

                job.submit();
                jobs.add(job);
            } catch (Exception re) {
                throw new RuntimeException(re);
            } finally {
                if (out != null)
                    out.close();
            }
        }

        for (Job job : jobs) {
            while (!job.isComplete()) {
                Thread.sleep(1000);
            }
        }

        for (String tableName : tables) {
            String workDir = workDirBase + "/" + tableName;
            String filesDir = workDir + "/files";
            String failuresDir = workDir + "/failures";
            FileSystem fs = FileSystem.get(conf);
            // make sure that the "accumulo" user can read/write/execute into these directories
            fs.setPermission(new Path(filesDir), new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL));
            fs.setPermission(new Path(failuresDir), new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL));

            tableOperations.importDirectory(tableName, filesDir, failuresDir, false);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
    return 0;
}
From source file:net.peacesoft.nutch.crawl.ReCrawl.java
License:Apache License
public Path getPathUrls(Path dir, List<String> seeds, String urls) throws Exception {
    String tmpSeedDir = urls + "/seed-" + System.currentTimeMillis();
    FileSystem fs = FileSystem.get(getConf());
    Path p = new Path(dir, tmpSeedDir);
    fs.mkdirs(p);
    Path seedOut = new Path(p, urls);
    OutputStream os = fs.create(seedOut);
    for (String s : seeds) {
        os.write(s.getBytes());
        os.write('\n');
    }
    os.flush();
    os.close();
    return p;
}
From source file:net.peacesoft.nutch.crawl.ReCrawlDb.java
License:Apache License
public static void install(JobConf job, Path crawlDb) throws IOException {
    boolean preserveBackup = job.getBoolean("db.preserve.backup", true);
    Path newCrawlDb = FileOutputFormat.getOutputPath(job);
    FileSystem fs = new JobClient(job).getFs();
    Path old = new Path(crawlDb, "old");
    Path current = new Path(crawlDb, CURRENT_NAME);
    if (fs.exists(current)) {
        if (fs.exists(old)) {
            fs.delete(old, true);
        }
        fs.rename(current, old);
    }
    fs.mkdirs(crawlDb);
    fs.rename(newCrawlDb, current);
    if (!preserveBackup && fs.exists(old)) {
        fs.delete(old, true);
    }
    Path lock = new Path(crawlDb, LOCK_NAME);
    LockUtil.removeLockFile(fs, lock);
}
From source file:net.peacesoft.nutch.crawl.ReLinkDb.java
License:Apache License
public static void install(JobConf job, Path linkDb) throws IOException {
    Path newLinkDb = FileOutputFormat.getOutputPath(job);
    FileSystem fs = new JobClient(job).getFs();
    Path old = new Path(linkDb, "old");
    Path current = new Path(linkDb, CURRENT_NAME);
    if (fs.exists(current)) {
        if (fs.exists(old)) {
            fs.delete(old, true);
        }
        fs.rename(current, old);
    }
    fs.mkdirs(linkDb);
    fs.rename(newLinkDb, current);
    if (fs.exists(old)) {
        fs.delete(old, true);
    }
    LockUtil.removeLockFile(fs, new Path(linkDb, LOCK_NAME));
}