List of usage examples for org.apache.hadoop.fs FileSystem exists
public boolean exists(Path f) throws IOException
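As a primer before the project examples below, a minimal self-contained sketch of the call; the class name and path here are illustrative, not taken from the source files:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ExistsExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/example.txt"); // hypothetical path
        FileSystem fs = path.getFileSystem(conf); // resolves the FS for the path's scheme
        if (fs.exists(path)) {
            System.out.println(path + " exists");
        } else {
            System.out.println(path + " does not exist");
        }
    }
}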
From source file:com.alexholmes.hadooputils.sort.SortInputSampler.java
License:Apache License
public static <K, V> void writePartitionFile(JobConf job, Sampler<K, V> sampler) throws IOException {
    Configuration conf = job;
    // Use the input format defined in the job, NOT the one provided by
    // the parent class's writePartitionFile() method, which will be a plain
    // TextInputFormat by default.
    final InputFormat inf = job.getInputFormat();
    int numPartitions = job.getNumReduceTasks();
    K[] samples = (K[]) sampler.getSample(inf, job);
    RawComparator<K> comparator = (RawComparator<K>) job.getOutputKeyComparator();
    Arrays.sort(samples, comparator);
    Path dst = new Path(TotalOrderPartitioner.getPartitionFile(job));
    FileSystem fs = dst.getFileSystem(conf);
    if (fs.exists(dst)) {
        fs.delete(dst, false);
    }
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, dst,
            job.getMapOutputKeyClass(), NullWritable.class);
    NullWritable nullValue = NullWritable.get();
    float stepSize = samples.length / (float) numPartitions;
    int last = -1;
    for (int i = 1; i < numPartitions; ++i) {
        int k = Math.round(stepSize * i);
        while (last >= k && comparator.compare(samples[last], samples[k]) == 0) {
            ++k;
        }
        writer.append(samples[k], nullValue);
        last = k;
    }
    writer.close();
}
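A note on the exists()/delete() pair above: it is a convenience rather than a requirement. FileSystem.delete(Path, boolean) returns false instead of throwing when the path is absent, so the guard only avoids acting on a missing file, and the two calls are not atomic with respect to concurrent writers of the partition file.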
From source file:com.alexholmes.hdfsslurper.Configurator.java
License:Apache License
public static void testCreateDir(Path p, Configuration conf)
        throws IOException, ConfigSettingException, FileSystemMkdirFailed {
    FileSystem fs = p.getFileSystem(conf);
    if (fs.exists(p) && !fs.getFileStatus(p).isDir()) {
        throw new ConfigSettingException("Directory appears to be a file: '" + p + "'");
    }
    if (!fs.exists(p)) {
        log.info("Attempting creation of directory: " + p);
        if (!fs.mkdirs(p)) {
            throw new FileSystemMkdirFailed("Failed to create directory: '" + p + "'");
        }
    }
}
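The exists()/getFileStatus() pair above performs two round trips to the NameNode for the same path. Since the default FileSystem.exists() is itself implemented as getFileStatus() plus a catch of FileNotFoundException, a single lookup can answer both questions; a minimal sketch, with an illustrative class and method name:

import java.io.FileNotFoundException;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ExistsViaStatus {
    // Returns true only if the path exists and is a directory,
    // using a single getFileStatus() call instead of exists() + getFileStatus().
    public static boolean isExistingDir(Path p, Configuration conf) throws IOException {
        FileSystem fs = p.getFileSystem(conf);
        try {
            FileStatus status = fs.getFileStatus(p);
            return status.isDirectory();
        } catch (FileNotFoundException e) {
            return false; // path does not exist
        }
    }
}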
From source file:com.alexholmes.hdfsslurper.WorkerThread.java
License:Apache License
private void process(FileStatus srcFileStatus) throws IOException, InterruptedException {
    Path stagingFile = null;
    FileSystem destFs = null;
    String filenameBatchidDelimiter = config.getFileNameBatchIdDelimiter();
    try {
        FileSystem srcFs = srcFileStatus.getPath().getFileSystem(config.getConfig());

        // run a script which can change the name of the file as well as
        // write out a new version of the file
        if (config.getWorkScript() != null) {
            Path newSrcFile = stageSource(srcFileStatus);
            srcFileStatus = srcFileStatus.getPath().getFileSystem(config.getConfig()).getFileStatus(newSrcFile);
        }

        Path srcFile = srcFileStatus.getPath();

        // get the target HDFS file
        Path destFile = getHdfsTargetPath(srcFileStatus);

        if (config.getCodec() != null) {
            String ext = config.getCodec().getDefaultExtension();
            if (!destFile.getName().endsWith(ext)) {
                destFile = new Path(destFile.toString() + ext);
            }
        }

        destFs = destFile.getFileSystem(config.getConfig());

        // get the staging HDFS file
        stagingFile = fileSystemManager.getStagingFile(srcFileStatus, destFile);
        String batchId = srcFile.toString()
                .substring(srcFile.toString().lastIndexOf(filenameBatchidDelimiter) + 1);

        log.info("event#Copying source file '" + srcFile + "' to staging destination '" + stagingFile + "'"
                + "$batchId#" + batchId);

        // if the directory of the target file doesn't exist, attempt to create it
        Path destParentDir = destFile.getParent();
        if (!destFs.exists(destParentDir)) {
            log.info("event#Attempting creation of target directory: " + destParentDir.toUri());
            if (!destFs.mkdirs(destParentDir)) {
                throw new IOException("event#Failed to create target directory: " + destParentDir.toUri());
            }
        }

        // if the staging directory doesn't exist, attempt to create it
        Path destStagingParentDir = stagingFile.getParent();
        if (!destFs.exists(destStagingParentDir)) {
            log.info("event#Attempting creation of staging directory: " + destStagingParentDir.toUri());
            if (!destFs.mkdirs(destStagingParentDir)) {
                // report the staging directory that failed (not the target directory)
                throw new IOException(
                        "event#Failed to create staging directory: " + destStagingParentDir.toUri());
            }
        }

        // copy the file
        InputStream is = null;
        OutputStream os = null;
        CRC32 crc = new CRC32();
        try {
            is = new BufferedInputStream(srcFs.open(srcFile));
            if (config.isVerify()) {
                is = new CheckedInputStream(is, crc);
            }
            os = destFs.create(stagingFile);
            if (config.getCodec() != null) {
                os = config.getCodec().createOutputStream(os);
            }
            IOUtils.copyBytes(is, os, 4096, false);
        } finally {
            IOUtils.closeStream(is);
            IOUtils.closeStream(os);
        }

        long srcFileSize = srcFs.getFileStatus(srcFile).getLen();
        long destFileSize = destFs.getFileStatus(stagingFile).getLen();
        if (config.getCodec() == null && srcFileSize != destFileSize) {
            throw new IOException(
                    "event#File sizes don't match, source = " + srcFileSize + ", dest = " + destFileSize);
        }

        log.info("event#Local file size = " + srcFileSize + ", HDFS file size = " + destFileSize
                + "$batchId#" + batchId);

        if (config.isVerify()) {
            verify(stagingFile, crc.getValue());
        }

        if (destFs.exists(destFile)) {
            destFs.delete(destFile, false);
        }

        log.info("event#Moving staging file '" + stagingFile + "' to destination '" + destFile + "'"
                + "$batchId#" + batchId);
        if (!destFs.rename(stagingFile, destFile)) {
            throw new IOException("event#Failed to rename file");
        }

        if (config.isCreateLzopIndex() && destFile.getName().endsWith(lzopExt)) {
            Path lzoIndexPath = new Path(destFile.toString() + LzoIndex.LZO_INDEX_SUFFIX);
            if (destFs.exists(lzoIndexPath)) {
                log.info("event#Deleting index file as it already exists");
                destFs.delete(lzoIndexPath, false);
            }
            indexer.index(destFile);
        }

        fileSystemManager.fileCopyComplete(srcFileStatus);
    } catch (Throwable t) {
        log.error("event#Caught exception working on file " + srcFileStatus.getPath(), t);

        // delete the staging file if it still exists; stagingFile may be null
        // if the failure occurred before staging was set up
        try {
            if (destFs != null && stagingFile != null && destFs.exists(stagingFile)) {
                destFs.delete(stagingFile, false);
            }
        } catch (Throwable t2) {
            log.error("event#Failed to delete staging file " + stagingFile, t2);
        }

        fileSystemManager.fileCopyError(srcFileStatus);
    }
}
From source file:com.alexholmes.json.mapreduce.ExampleJob.java
License:Apache License
/**
 * Writes the contents of {@link #JSON} into a file in the job input directory in HDFS.
 *
 * @param conf the Hadoop config
 * @param inputDir the HDFS input directory where we'll write a file
 * @throws IOException if something goes wrong
 */
public static void writeInput(Configuration conf, Path inputDir) throws IOException {
    FileSystem fs = FileSystem.get(conf);

    if (fs.exists(inputDir)) {
        throw new IOException(
                String.format("Input directory '%s' exists - please remove and rerun this example", inputDir));
    }

    OutputStreamWriter writer = new OutputStreamWriter(fs.create(new Path(inputDir, "input.txt")));
    writer.write(JSON);
    IOUtils.closeStream(writer);
}
From source file:com.alibaba.jstorm.hdfs.spout.HdfsSpout.java
License:Apache License
private static void validateOrMakeDir(FileSystem fs, Path dir, String dirDescription) {
    try {
        if (fs.exists(dir)) {
            if (!fs.isDirectory(dir)) {
                LOG.error(dirDescription + " directory is a file, not a dir. " + dir);
                throw new RuntimeException(dirDescription + " directory is a file, not a dir. " + dir);
            }
        } else if (!fs.mkdirs(dir)) {
            LOG.error("Unable to create " + dirDescription + " directory " + dir);
            throw new RuntimeException("Unable to create " + dirDescription + " directory " + dir);
        }
    } catch (IOException e) {
        LOG.error("Unable to create " + dirDescription + " directory " + dir, e);
        throw new RuntimeException("Unable to create " + dirDescription + " directory " + dir, e);
    }
}
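A design note on the pattern above: because FileSystem.mkdirs() has mkdir -p semantics and succeeds when the directory already exists, the exists() branch is there mainly to raise the more specific "is a file, not a dir" error; a directory created concurrently between the two calls does not make mkdirs() fail.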
From source file:com.anhth12.lambda.ml.MLUpdate.java
@Override
public void runUpdate(JavaSparkContext sparkContext, long timestamp,
        JavaPairRDD<String, M> newKeyMessageData, JavaPairRDD<String, M> pastKeyMessageData,
        String modelDirString, TopicProducer<String, String> modelUpdateTopic)
        throws IOException, InterruptedException {

    Preconditions.checkNotNull(newKeyMessageData);

    JavaRDD<M> newData = newKeyMessageData.values();
    JavaRDD<M> pastData = pastKeyMessageData == null ? null : pastKeyMessageData.values();

    if (newData != null) {
        newData.cache();
        newData.foreachPartition(Functions.<Iterator<M>>noOp());
    }
    if (pastData != null) {
        pastData.cache();
        pastData.foreachPartition(Functions.<Iterator<M>>noOp());
    }

    List<HyperParamValues<?>> hyperParamValues = getHyperParamValues();

    int valuesPerHyperParam = HyperParams.chooseValuesPerHyperParam(hyperParamValues.size(), candidates);
    List<List<?>> hyperParameterCombos = HyperParams.chooseHyperParameterCombos(hyperParamValues, candidates,
            valuesPerHyperParam);

    FileSystem fs = FileSystem.get(sparkContext.hadoopConfiguration());

    Path modelDir = new Path(modelDirString);
    Path tempModelPath = new Path(modelDir, ".temporary");
    Path candidatesPath = new Path(tempModelPath, Long.toString(System.currentTimeMillis()));
    fs.mkdirs(candidatesPath);

    Path bestCandidatePath = findBestCandidatePath(sparkContext, newData, pastData, hyperParameterCombos,
            candidatesPath);

    Path finalPath = new Path(modelDir, Long.toString(System.currentTimeMillis()));

    if (bestCandidatePath == null) {
        log.info("Unable to build any model");
    } else {
        fs.rename(bestCandidatePath, finalPath);
    }

    fs.delete(candidatesPath, true);

    Path bestModelPath = new Path(finalPath, MODEL_FILE_NAME);

    if (fs.exists(bestModelPath)) {
        PMML bestModel;
        // read the model file itself rather than its parent directory
        try (InputStream in = new GZIPInputStream(fs.open(bestModelPath), 1 << 16)) {
            bestModel = PMMLUtils.read(in);
        }
        modelUpdateTopic.send("MODEL", PMMLUtils.toString(bestModel));
        publishAdditionalModelData(sparkContext, bestModel, newData, pastData, candidatesPath,
                modelUpdateTopic);
    }

    if (newData != null) {
        newData.unpersist();
    }
    if (pastData != null) {
        pastData.unpersist();
    }
}
From source file:com.anhth12.lambda.ml.MLUpdate.java
private Path findBestCandidatePath(JavaSparkContext sparkContext, JavaRDD<M> newData, JavaRDD<M> pastData,
        List<List<?>> hyperParameterCombos, Path candidatesPath) throws InterruptedException, IOException {
    Map<Path, Double> pathToEval = new HashMap<>(candidates);
    if (evalParallelism > 1) {
        Collection<Future<Tuple2<Path, Double>>> futures = new ArrayList<>(candidates);
        ExecutorService executor = Executors.newFixedThreadPool(evalParallelism);
        try {
            for (int i = 0; i < candidates; i++) {
                futures.add(executor.submit(new BuildAndEvalWorker(i, hyperParameterCombos, sparkContext,
                        newData, pastData, candidatesPath)));
            }
        } finally {
            executor.shutdown();
        }
        for (Future<Tuple2<Path, Double>> future : futures) {
            Tuple2<Path, Double> pathEval;
            try {
                pathEval = future.get();
            } catch (ExecutionException ex) {
                throw new IllegalStateException(ex);
            }
            pathToEval.put(pathEval._1, pathEval._2);
        }
    } else {
        for (int i = 0; i < candidates; i++) {
            Tuple2<Path, Double> pathEval = new BuildAndEvalWorker(i, hyperParameterCombos, sparkContext,
                    newData, pastData, candidatesPath).call();
            pathToEval.put(pathEval._1, pathEval._2);
        }
    }
    FileSystem fs = FileSystem.get(sparkContext.hadoopConfiguration());
    Path bestCandidatePath = null;
    double bestEval = Double.NEGATIVE_INFINITY;
    for (Map.Entry<Path, Double> pathEval : pathToEval.entrySet()) {
        Path path = pathEval.getKey();
        Double eval = pathEval.getValue();
        // only consider candidates that still exist on the file system;
        // the parenthesization makes the exists() check apply in every case
        if (fs.exists(path) && (bestCandidatePath == null || (eval != null && eval > bestEval))) {
            log.info("Best eval / path is now {} / {}", eval, path);
            if (eval != null) {
                bestEval = eval;
            }
            bestCandidatePath = path;
        }
    }
    return bestCandidatePath;
}
From source file:com.architecting.ch07.MapReduceIndexerTool.java
License:Apache License
/** API for Java clients; visible for testing; may become a public API eventually */
int run(Options options) throws Exception {
    if (getConf().getBoolean("isMR1", false) && "local".equals(getConf().get("mapred.job.tracker"))) {
        throw new IllegalStateException(
                "Running with LocalJobRunner (i.e. all of Hadoop inside a single JVM) is not supported "
                        + "because LocalJobRunner does not (yet) implement the Hadoop Distributed Cache feature, "
                        + "which is required for passing files via --files and --libjars");
    }

    long programStartTime = System.nanoTime();
    getConf().setInt(SolrOutputFormat.SOLR_RECORD_WRITER_MAX_SEGMENTS, options.maxSegments);

    // switch off a false warning about allegedly not implementing Tool
    // also see http://hadoop.6.n7.nabble.com/GenericOptionsParser-warning-td8103.html
    // also see https://issues.apache.org/jira/browse/HADOOP-8183
    getConf().setBoolean("mapred.used.genericoptionsparser", true);

    if (options.log4jConfigFile != null) {
        Utils.setLogConfigFile(options.log4jConfigFile, getConf());
        addDistributedCacheFile(options.log4jConfigFile, getConf());
    }

    Configuration config = HBaseConfiguration.create();
    Job job = Job.getInstance(config);
    job.setJarByClass(getClass());

    // To be able to run this example from eclipse, we need to make sure
    // the built jar is distributed to the map-reduce tasks from the
    // local file system.
    job.addCacheArchive(new URI("file:///home/cloudera/ahae/target/ahae.jar"));

    FileSystem fs = options.outputDir.getFileSystem(job.getConfiguration());
    if (fs.exists(options.outputDir) && !delete(options.outputDir, true, fs)) {
        return -1;
    }
    Path outputResultsDir = new Path(options.outputDir, RESULTS_DIR);
    Path outputReduceDir = new Path(options.outputDir, "reducers");

    int reducers = 1;

    Scan scan = new Scan();
    scan.addFamily(CF);
    // tag::SETUP[]
    scan.setCaching(500); // <1>
    scan.setCacheBlocks(false); // <2>

    TableMapReduceUtil.initTableMapperJob( // <3>
            options.inputTable, // Input HBase table name
            scan, // Scan instance to control what to index
            HBaseAvroToSOLRMapper.class, // Mapper to parse cells content.
            Text.class, // Mapper output key
            SolrInputDocumentWritable.class, // Mapper output value
            job);

    FileOutputFormat.setOutputPath(job, outputReduceDir);

    job.setJobName(getClass().getName() + "/" + Utils.getShortClassName(HBaseAvroToSOLRMapper.class));
    job.setReducerClass(SolrReducer.class); // <4>
    job.setPartitionerClass(SolrCloudPartitioner.class); // <5>
    job.getConfiguration().set(SolrCloudPartitioner.ZKHOST, options.zkHost);
    job.getConfiguration().set(SolrCloudPartitioner.COLLECTION, options.collection);
    job.getConfiguration().setInt(SolrCloudPartitioner.SHARDS, options.shards);

    job.setOutputFormatClass(SolrOutputFormat.class);
    SolrOutputFormat.setupSolrHomeCache(options.solrHomeDir, job);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(SolrInputDocumentWritable.class);
    job.setSpeculativeExecution(false);
    // end::SETUP[]
    job.setNumReduceTasks(reducers); // Set the number of reducers based on the number of shards we have.

    if (!waitForCompletion(job, true)) {
        return -1; // job failed
    }

    assert reducers == options.shards;

    // normalize output shard dir prefix, i.e.
    // rename part-r-00000 to part-00000 (stems from zero tree merge iterations)
    // rename part-m-00000 to part-00000 (stems from > 0 tree merge iterations)
    for (FileStatus stats : fs.listStatus(outputReduceDir)) {
        String dirPrefix = SolrOutputFormat.getOutputName(job);
        Path srcPath = stats.getPath();
        if (stats.isDirectory() && srcPath.getName().startsWith(dirPrefix)) {
            String dstName = dirPrefix + srcPath.getName().substring(dirPrefix.length() + "-m".length());
            Path dstPath = new Path(srcPath.getParent(), dstName);
            if (!rename(srcPath, dstPath, fs)) {
                return -1;
            }
        }
    }

    // publish results dir
    if (!rename(outputReduceDir, outputResultsDir, fs)) {
        return -1;
    }

    if (options.goLive && !new GoLive().goLive(options, listSortedOutputShardDirs(job, outputResultsDir, fs))) {
        return -1;
    }

    goodbye(job, programStartTime);
    return 0;
}
From source file:com.asakusafw.cleaner.main.HDFSCleaner.java
License:Apache License
/**
 * Cleans a directory on HDFS, deleting expired entries.
 * (The original comments were mis-encoded; the descriptions below are inferred from the code.)
 * @param fs the HDFS file system
 * @param cleanPath the HDFS directory to clean
 * @param isSetExecutionId whether child directory names are execution IDs to check against running jobflows
 * @param pattern file name pattern passed to {@code isMatchPattern()}
 * @param keepDate retention threshold passed to {@code isExpired()}
 * @param now the current time
 * @param recursive whether to clean subdirectories recursively
 * @return true if cleaning succeeded
 * @throws CleanerSystemException if a cleaner system error occurs
 */
private boolean cleanDir(FileSystem fs, Path cleanPath, boolean isSetExecutionId, String pattern,
        int keepDate, Date now, boolean recursive) throws CleanerSystemException {
    try {
        if (!fs.exists(cleanPath)) {
            // the cleaning target does not exist
            Log.log(CLASS, MessageIdConst.HCLN_CLEN_DIR_ERROR, "cleaning target does not exist",
                    cleanPath.toString());
            return false;
        }
        if (!fs.getFileStatus(cleanPath).isDir()) {
            // the cleaning target is not a directory
            Log.log(CLASS, MessageIdConst.HCLN_CLEN_DIR_ERROR, "cleaning target is not a directory",
                    cleanPath.toString());
            return false;
        }

        // start cleaning
        Log.log(CLASS, MessageIdConst.HCLN_FILE_DELETE, cleanPath.toString());
        int cleanFileCount = 0;
        int cleanDirCount = 0;
        boolean result = true;
        FileStatus[] dirStatus = getListStatus(fs, cleanPath);
        Path[] listedPaths = FileUtil.stat2Paths(dirStatus);
        for (Path path : listedPaths) {
            FileStatus status = fs.getFileStatus(path);
            long lastModifiedTime = status.getModificationTime();
            if (status.isDir() && recursive) {
                // clean subdirectories recursively
                if (isSetExecutionId) {
                    // if the directory name is the execution ID of a running jobflow, skip it
                    String executionId = path.getName();
                    if (isRunningJobFlow(executionId)) {
                        Log.log(CLASS, MessageIdConst.HCLN_CLEN_DIR_EXEC, path.toString());
                        continue;
                    }
                }
                FileStatus[] childdirStatus = getListStatus(fs, path);
                if (childdirStatus.length == 0) {
                    // the directory is empty: delete it if it has expired
                    if (isExpired(lastModifiedTime, keepDate, now)) {
                        if (!fs.delete(path, false)) {
                            Log.log(CLASS, MessageIdConst.HCLN_CLEN_FAIL, "", path.toString());
                            result = false;
                        } else {
                            cleanDirCount++;
                            Log.log(CLASS, MessageIdConst.HCLN_DIR_DELETE, path.toString());
                        }
                    }
                } else {
                    // the directory has children: clean it first, then delete it if now empty and expired
                    if (cleanDir(fs, path, false, pattern, keepDate, now, recursive)) {
                        childdirStatus = getListStatus(fs, path);
                        if (childdirStatus.length == 0) {
                            if (isExpired(lastModifiedTime, keepDate, now)) {
                                if (!fs.delete(path, false)) {
                                    Log.log(CLASS, MessageIdConst.HCLN_CLEN_FAIL, "", path.toString());
                                    result = false;
                                } else {
                                    cleanDirCount++;
                                    Log.log(CLASS, MessageIdConst.HCLN_DIR_DELETE, path.toString());
                                }
                            }
                        }
                    } else {
                        Log.log(CLASS, MessageIdConst.HCLN_CLEN_FAIL, "", path.toString());
                        result = false;
                    }
                }
            } else if (!status.isDir()) {
                // delete a file if it has expired and matches the pattern
                if (isExpired(lastModifiedTime, keepDate, now) && isMatchPattern(path, pattern)) {
                    if (!fs.delete(path, false)) {
                        Log.log(CLASS, MessageIdConst.HCLN_CLEN_FAIL, "", path.toString());
                        result = false;
                    } else {
                        Log.log(CLASS, MessageIdConst.HCLN_DELETE_FILE, path.toString());
                        cleanFileCount++;
                    }
                }
            }
        }
        Log.log(CLASS, MessageIdConst.HCLN_FILE_DELETE_SUCCESS, cleanPath.toString(), cleanDirCount,
                cleanFileCount);
        return result;
    } catch (IOException e) {
        Log.log(e, CLASS, MessageIdConst.HCLN_CLEN_DIR_EXCEPTION, cleanPath.getName());
        return false;
    }
}
From source file:com.asakusafw.compiler.util.tester.HadoopDriver.java
License:Apache License
/**
 * Cleans up the temporary working area.
 * @throws IOException if failed to clean up
 */
public void clean() throws IOException {
    logger.info("clean user directory");
    Path path = new Path(toPath().toPath('/'));
    FileSystem fs = path.getFileSystem(configuration);
    try {
        if (fs.exists(path)) {
            fs.delete(path, true);
        }
    } catch (IOException e) {
        logger.info(MessageFormat.format("Failed to fs -rmr {0}", toPath()), e);
    }
}