List of usage examples for `org.apache.hadoop.fs.FileSystem#exists`:
public boolean exists(Path f) throws IOException
From source file:com.elex.dmp.lda.CVB0Driver.java
License:Apache License
public static int run(Configuration conf, Path inputPath, Path topicModelOutputPath, int numTopics, int numTerms, double alpha, double eta, int maxIterations, int iterationBlockSize, double convergenceDelta, Path dictionaryPath, Path docTopicOutputPath, Path topicModelStateTempPath, long randomSeed, float testFraction, int numTrainThreads, int numUpdateThreads, int maxItersPerDoc, int numReduceTasks, boolean backfillPerplexity) throws ClassNotFoundException, IOException, InterruptedException { // verify arguments Preconditions.checkArgument(testFraction >= 0.0 && testFraction <= 1.0, "Expected 'testFraction' value in range [0, 1] but found value '%s'", testFraction); Preconditions.checkArgument(!backfillPerplexity || testFraction > 0.0, "Expected 'testFraction' value in range (0, 1] but found value '%s'", testFraction); String infoString = "Will run Collapsed Variational Bayes (0th-derivative approximation) " + "learning for LDA on {} (numTerms: {}), finding {}-topics, with document/topic prior {}, " + "topic/term prior {}. Maximum iterations to run will be {}, unless the change in " + "perplexity is less than {}. Topic model output (p(term|topic) for each topic) will be " + "stored {}. Random initialization seed is {}, holding out {} of the data for perplexity " + "check\n"; log.info(infoString, new Object[] { inputPath, numTerms, numTopics, alpha, eta, maxIterations, convergenceDelta, topicModelOutputPath, randomSeed, testFraction }); infoString = dictionaryPath == null ? "" : "Dictionary to be used located " + dictionaryPath.toString() + '\n'; infoString += docTopicOutputPath == null ? "" : "p(topic|docId) will be stored " + docTopicOutputPath.toString() + '\n'; log.info(infoString);// w ww . j av a 2 s. 
co m FileSystem fs = FileSystem.get(topicModelStateTempPath.toUri(), conf); int iterationNumber = getCurrentIterationNumber(conf, topicModelStateTempPath, maxIterations); log.info("Current iteration number: {}", iterationNumber); conf.set(NUM_TOPICS, String.valueOf(numTopics)); conf.set(NUM_TERMS, String.valueOf(numTerms)); conf.set(DOC_TOPIC_SMOOTHING, String.valueOf(alpha)); conf.set(TERM_TOPIC_SMOOTHING, String.valueOf(eta)); conf.set(RANDOM_SEED, String.valueOf(randomSeed)); conf.set(NUM_TRAIN_THREADS, String.valueOf(numTrainThreads)); conf.set(NUM_UPDATE_THREADS, String.valueOf(numUpdateThreads)); conf.set(MAX_ITERATIONS_PER_DOC, String.valueOf(maxItersPerDoc)); conf.set(MODEL_WEIGHT, "1"); // TODO conf.set(TEST_SET_FRACTION, String.valueOf(testFraction)); List<Double> perplexities = Lists.newArrayList(); for (int i = 1; i <= iterationNumber; i++) { // form path to model Path modelPath = modelPath(topicModelStateTempPath, i); // read perplexity double perplexity = readPerplexity(conf, topicModelStateTempPath, i); if (Double.isNaN(perplexity)) { if (!(backfillPerplexity && i % iterationBlockSize == 0)) { continue; } log.info("Backfilling perplexity at iteration {}", i); if (!fs.exists(modelPath)) { log.error("Model path '{}' does not exist; Skipping iteration {} perplexity calculation", modelPath.toString(), i); continue; } perplexity = calculatePerplexity(conf, inputPath, modelPath, i); } // register and log perplexity perplexities.add(perplexity); log.info("Perplexity at iteration {} = {}", i, perplexity); } long startTime = System.currentTimeMillis(); while (iterationNumber < maxIterations) { // test convergence if (convergenceDelta > 0.0) { double delta = rateOfChange(perplexities); if (delta < convergenceDelta) { log.info("Convergence achieved at iteration {} with perplexity {} and delta {}", new Object[] { iterationNumber, perplexities.get(perplexities.size() - 1), delta }); break; } } // update model iterationNumber++; log.info("About to run iteration {} 
of {}", iterationNumber, maxIterations); Path modelInputPath = modelPath(topicModelStateTempPath, iterationNumber - 1); Path modelOutputPath = modelPath(topicModelStateTempPath, iterationNumber); runIteration(conf, inputPath, modelInputPath, modelOutputPath, iterationNumber, maxIterations, numReduceTasks); // calculate perplexity if (testFraction > 0 && iterationNumber % iterationBlockSize == 0) { perplexities.add(calculatePerplexity(conf, inputPath, modelOutputPath, iterationNumber)); log.info("Current perplexity = {}", perplexities.get(perplexities.size() - 1)); log.info("(p_{} - p_{}) / p_0 = {}; target = {}", new Object[] { iterationNumber, iterationNumber - iterationBlockSize, rateOfChange(perplexities), convergenceDelta }); } } log.info("Completed {} iterations in {} seconds", iterationNumber, (System.currentTimeMillis() - startTime) / 1000); log.info("Perplexities: ({})", Joiner.on(", ").join(perplexities)); // write final topic-term and doc-topic distributions Path finalIterationData = modelPath(topicModelStateTempPath, iterationNumber); Job topicModelOutputJob = topicModelOutputPath != null ? writeTopicModel(conf, finalIterationData, topicModelOutputPath) : null; Job docInferenceJob = docTopicOutputPath != null ? writeDocTopicInference(conf, inputPath, finalIterationData, docTopicOutputPath) : null; if (topicModelOutputJob != null && !topicModelOutputJob.waitForCompletion(true)) { return -1; } if (docInferenceJob != null && !docInferenceJob.waitForCompletion(true)) { return -1; } return 0; }
From source file:com.elex.dmp.lda.CVB0Driver.java
License:Apache License
/** * @param topicModelStateTemp//from w w w . j av a 2 s . c o m * @param iteration * @return {@code double[2]} where first value is perplexity and second is model weight of those * documents sampled during perplexity computation, or {@code null} if no perplexity data * exists for the given iteration. * @throws IOException */ public static double readPerplexity(Configuration conf, Path topicModelStateTemp, int iteration) throws IOException { Path perplexityPath = perplexityPath(topicModelStateTemp, iteration); FileSystem fs = FileSystem.get(perplexityPath.toUri(), conf); if (!fs.exists(perplexityPath)) { log.warn("Perplexity path {} does not exist, returning NaN", perplexityPath); return Double.NaN; } double perplexity = 0; double modelWeight = 0; long n = 0; for (Pair<DoubleWritable, DoubleWritable> pair : new SequenceFileDirIterable<DoubleWritable, DoubleWritable>( perplexityPath, PathType.LIST, PathFilters.partFilter(), null, true, conf)) { modelWeight += pair.getFirst().get(); perplexity += pair.getSecond().get(); n++; } log.info("Read {} entries with total perplexity {} and model weight {}", new Object[] { n, perplexity, modelWeight }); return perplexity / modelWeight; }
From source file:com.elex.dmp.lda.CVB0Driver.java
License:Apache License
private static Job writeDocTopicInference(Configuration conf, Path corpus, Path modelInput, Path output) throws IOException, ClassNotFoundException, InterruptedException { String jobName = String.format("Writing final document/topic inference from %s to %s", corpus, output); log.info("About to run: " + jobName); Job job = new Job(conf, jobName); job.setMapperClass(CVB0DocInferenceMapper.class); job.setNumReduceTasks(0);/* w w w . j a v a2s .c om*/ job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(VectorWritable.class); FileSystem fs = FileSystem.get(corpus.toUri(), conf); if (modelInput != null && fs.exists(modelInput)) { FileStatus[] statuses = fs.listStatus(modelInput, PathFilters.partFilter()); URI[] modelUris = new URI[statuses.length]; for (int i = 0; i < statuses.length; i++) { modelUris[i] = statuses[i].getPath().toUri(); } DistributedCache.setCacheFiles(modelUris, conf); } setModelPaths(job, modelInput);//bug:mahout-1147 FileInputFormat.addInputPath(job, corpus); FileOutputFormat.setOutputPath(job, output); job.setJarByClass(CVB0Driver.class); job.submit(); return job; }
From source file:com.elex.dmp.lda.CVB0Driver.java
License:Apache License
private static int getCurrentIterationNumber(Configuration config, Path modelTempDir, int maxIterations) throws IOException { FileSystem fs = FileSystem.get(modelTempDir.toUri(), config); int iterationNumber = 1; Path iterationPath = modelPath(modelTempDir, iterationNumber); while (fs.exists(iterationPath) && iterationNumber <= maxIterations) { log.info("Found previous state: " + iterationPath); iterationNumber++;// w w w . j av a 2s. co m iterationPath = modelPath(modelTempDir, iterationNumber); } return iterationNumber - 1; }
From source file:com.ema.hadoop.test_hdfs.TestWrite.java
public static void main(String[] args) throws IOException, URISyntaxException { Configuration configuration = new Configuration(); FileSystem hdfs = FileSystem.get(new URI("hdfs://localhost:9000"), configuration); Path file = new Path("hdfs://localhost:9000/user/student/text_file_write.txt"); if (hdfs.exists(file)) { hdfs.delete(file, true);//from w ww . j a va 2 s. c o m } OutputStream os = hdfs.create(file, new Progressable() { @Override public void progress() { out.println("...bytes written"); } }); BufferedWriter br = new BufferedWriter(new OutputStreamWriter(os, "UTF-8")); br.write("This is just a test to check if it is possible to write a file on HDFS using the Java API"); br.close(); hdfs.close(); }
From source file:com.ema.hadoop.wordcount.WordCount_cache.java
public static void main(String[] args) throws Exception { if (args.length != 2) { System.err.println("Usage: WordCount <input path> <output path>"); System.exit(-1);//from w w w .j a va 2 s . com } // First we write the stop word list // it could also be a file manually loaded into HDFS String[] stopwords = { "the", "a" }; Configuration configuration = new Configuration(); FileSystem hdfs = FileSystem.get(new URI("hdfs://localhost:9000"), configuration); Path file = new Path("hdfs://localhost:9000/user/student/stop_words.txt"); if (hdfs.exists(file)) { hdfs.delete(file, true); } OutputStream os = hdfs.create(file, new Progressable() { @Override public void progress() { out.println("...bytes written"); } }); BufferedWriter br = new BufferedWriter(new OutputStreamWriter(os, "UTF-8")); for (String w : stopwords) { br.write(w + "\n"); } br.close(); hdfs.close(); Job job = Job.getInstance(); job.addCacheFile(new Path("hdfs://localhost:9000/user/student/stop_words.txt").toUri()); job.setJarByClass(WordCount_cache.class); job.setJobName("Word count job"); FileInputFormat.setInputPaths(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setMapperClass(WCMapper_cache.class); job.setReducerClass(WCReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:com.endgame.binarypig.util.BuildSequenceFileFromArchive.java
License:Apache License
@Override public int run(String[] args) throws Exception { File inDirOrFile = new File(args[0]); Path outputDir = new Path(args[1]); Configuration conf = getConf(); FileSystem fs = FileSystem.get(conf); if (!fs.exists(outputDir)) { fs.mkdirs(outputDir);/*from w w w .ja va2 s .co m*/ } if (inDirOrFile.isFile()) { load(fs, conf, inDirOrFile, outputDir); } else { for (File file : inDirOrFile.listFiles()) { if (!file.isFile()) { System.out.println("Skipping " + file + " (not a file) ..."); continue; } load(fs, conf, file, outputDir); } } return 0; }
From source file:com.ery.dimport.daemon.TaskManager.java
License:Apache License
public void runTask(final TaskInfo task) { List<LogHostRunInfoPO> allFiles = new ArrayList<LogHostRunInfoPO>(); try {/*from w ww . ja v a 2 s. c o m*/ task.START_TIME = new Date(System.currentTimeMillis()); boolean needUpdate = false; TaskInfo exists = allTask.get(task.TASK_ID); if (exists == null) { needUpdate = true; } else { task.hosts = exists.hosts; } if (task.hosts == null || task.hosts.size() == 0) { task.hosts = new ArrayList<String>(master.getServerManager().getOnlineServers().keySet()); needUpdate = true; } if (ZKUtil.checkExists(watcher, watcher.dimportRunTaskNode + "/" + task.TASK_ID) == -1) { needUpdate = true; } if (needUpdate) { try { task.HOST_SIZE = task.hosts.size(); master.logWriter.writeLog(task); ZKUtil.createSetData(watcher, watcher.dimportRunTaskNode + "/" + task.TASK_ID, DImportConstant.Serialize(task)); } catch (Throwable e) { } } Thread thread = Thread.currentThread(); ProcessInfo procInfo = null; synchronized (taskInProgress) { procInfo = taskInProgress.get(task.getRunTaskId()); } procInfo.thread = thread; procInfo.startTime = System.currentTimeMillis(); procInfo.startTime = System.currentTimeMillis(); String filePath = task.FILE_PATH; boolean isInHdfs = false; final Map<String, Long> files = new HashMap<String, Long>(); String tmpPath = conf.get(DImportConstant.DIMPORT_PROCESS_TMPDATA_DIR, System.getProperty("user.home")); if (tmpPath.endsWith("/")) { tmpPath = tmpPath.substring(0, tmpPath.length() - 1); } if (filePath == null || filePath.equals("")) { files.put("", 0l); } else { if (task.fileNamePattern != null || (task.FILE_FILTER != null && !task.FILE_FILTER.equals(""))) { task.FILE_FILTER = DImportConstant.macroProcess(task.FILE_FILTER); task.FILE_FILTER = task.FILE_FILTER.replaceAll("\\{host\\}", this.master.hostName); task.fileNamePattern = Pattern.compile(task.FILE_FILTER); } Matcher m = hdfsUrlPattern.matcher(filePath); if (m.matches()) { isInHdfs = true; filePath = m.group(2); // for (String string : 
conf.getValByRegex(".*").keySet()) { // System.out.println(string + "=" + conf.get(string)); // } Path dirPath = new Path(filePath); FileSystem fs = FileSystem.get(HadoopConf.getConf(conf)); if (!fs.exists(dirPath) || !fs.isDirectory(dirPath)) { throw new IOException("HDFS? " + filePath + "?,?"); } FileStatus[] hFiles = fs.listStatus(dirPath, new PathFilter() { @Override public boolean accept(Path name) { if (task.fileNamePattern != null) { System.out.println("hdfs listStatus:" + name.getParent() + "/" + name.getName()); return task.fileNamePattern.matcher(name.getName()).matches(); } else { return true; } } }); for (int i = 0; i < hFiles.length; i++) { files.put(hFiles[i].getPath().toString(), hFiles[i].getLen()); } } else { java.io.File f = new File(filePath); if (!f.exists() || !f.isDirectory()) { throw new IOException( "? " + filePath + "? ,?"); } File[] lFiles = f.listFiles(new FilenameFilter() { public boolean accept(File dir, String name) { if (task.fileNamePattern != null) { System.out.println("local fs listStatus:" + dir + "/" + name); return task.fileNamePattern.matcher(name).matches(); } else { return true; } } }); for (int i = 0; i < lFiles.length; i++) { files.put(lFiles[i].getAbsolutePath(), lFiles[i].length()); } } } for (String fileName : files.keySet()) { LogHostRunInfoPO runInfo = new LogHostRunInfoPO(task); runInfo.RUN_LOG_ID = DImportConstant.shdf.format(task.SUBMIT_TIME) + "_" + allFiles.size() + "_" + fileName.hashCode(); runInfo.FILE_NAME = fileName; runInfo.RETURN_CODE = 255; runInfo.IS_RUN_SUCCESS = -1; runInfo.FILE_SIZE = files.get(fileName); runInfo.HOST_NAME = master.hostName; String localFile = fileName; if (isInHdfs) {// localFile = tmpPath + "/" + fileName.substring(fileName.lastIndexOf("/") + 1); } // String[] cmds = procInfo.task.getCommand(); for (int j = 0; j < cmds.length; j++) { cmds[j] = DImportConstant.macroProcess(cmds[j]); cmds[j] = cmds[j].replaceAll("\\{file\\}", localFile); cmds[j] = cmds[j].replaceAll("\\{host\\}", 
master.hostName); } runInfo.RUN_COMMAND = StringUtils.join(" ", cmds); master.logWriter.writeLog(runInfo); LOG.info("??" + runInfo); allFiles.add(runInfo); } ZKUtil.createSetData(watcher, watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/" + master.hostName, DImportConstant.Serialize(allFiles)); for (LogHostRunInfoPO runInfo : allFiles) { if (procInfo.stoped) break; String fileName = runInfo.FILE_NAME; LOG.info("?:" + fileName); procInfo.RUN_LOG_ID = runInfo.RUN_LOG_ID; runInfo.START_TIME = new Date(System.currentTimeMillis()); procInfo.processFile = fileName; String localFile = fileName; try { if (isInHdfs) {// localFile = tmpPath + "/" + fileName.substring(fileName.lastIndexOf("/") + 1); } procInfo.task.TASK_COMMAND = runInfo.RUN_COMMAND; if (isInHdfs) {// File lf = new File(localFile); if (lf.exists()) lf.delete(); FileSystem fs = FileSystem.get(HadoopConf.getConf(conf)); LOG.info("HDFS:" + fileName + "===>" + localFile); long btime = System.currentTimeMillis(); fs.copyToLocalFile(new Path(fileName), new Path(localFile)); LOG.info("HDFS?:" + fileName + "===>" + localFile); runInfo.downTime = System.currentTimeMillis() - btime; fileName = localFile; } updateHostInfoLog(runInfo, allFiles); LOG.info(procInfo.task.TASK_NAME + " commandline: " + procInfo.task.TASK_COMMAND); procInfo.proc = execResult(runInfo.RUN_COMMAND); runInfo.IS_RUN_SUCCESS = 1; runInfo.RETURN_CODE = writeProcessLog(procInfo); LOG.info(procInfo.task.TASK_NAME + " return value: " + runInfo.RETURN_CODE); // runInfo.RETURN_CODE = procInfo.proc.exitValue(); } catch (Throwable e) { runInfo.ERROR_MSG = e.getMessage(); if (procInfo.proc != null) { try { procInfo.proc.destroy(); } catch (Exception ex) { } } procInfo.proc = null; LOG.error("", e); } finally { // runInfo.END_TIME = new Date(System.currentTimeMillis()); master.logWriter.updateLog(runInfo); updateHostInfoLog(runInfo, allFiles); ZKUtil.createSetData(watcher, watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/" + master.hostName, 
DImportConstant.Serialize(allFiles)); if (isInHdfs) { File lf = new File(localFile); if (lf.exists()) lf.delete(); } } } } catch (Throwable e) { LOG.error("" + task, e); try { if (allFiles.size() > 0) { for (LogHostRunInfoPO logHostRunInfoPO : allFiles) { if (logHostRunInfoPO.END_TIME.getTime() < 10000) { logHostRunInfoPO.END_TIME = new Date(System.currentTimeMillis()); logHostRunInfoPO.IS_RUN_SUCCESS = 1; logHostRunInfoPO.RETURN_CODE = 2; } } ZKUtil.createSetData(watcher, watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/" + master.hostName, DImportConstant.Serialize(allFiles)); } } catch (KeeperException e1) { LOG.error("update task run info on host :" + watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/" + master.hostName, e); } catch (IOException e1) { LOG.error("update task run info on host " + watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/" + master.hostName, e); } } finally { // synchronized (taskInProgress) { taskInProgress.remove(task.getRunTaskId()); } } }
From source file:com.facebook.hiveio.common.FileSystems.java
License:Apache License
/** * Move a file or directory from source to destination, recursively copying * subdirectories.//from w w w .j ava 2 s. co m * * @param fs FileSystem * @param file path to copy (file or directory) * @param src path to source directory * @param dest path to destination directory * @throws IOException I/O problems */ public static void move(FileSystem fs, Path file, Path src, Path dest) throws IOException { Path destFilePath = pathInDestination(file, src, dest); if (fs.isFile(file)) { if (fs.exists(destFilePath)) { if (!fs.delete(destFilePath, true)) { throw new IllegalArgumentException("Could not remove existing file " + destFilePath); } } if (!fs.rename(file, destFilePath)) { throw new IllegalArgumentException("Could not move " + file + " to " + destFilePath); } } else if (fs.getFileStatus(file).isDir()) { FileStatus[] statuses = fs.listStatus(file); fs.mkdirs(destFilePath); if (statuses != null) { for (FileStatus status : statuses) { move(fs, status.getPath(), src, dest); } } } }
From source file:com.facebook.hiveio.output.HiveApiOutputCommitter.java
License:Apache License
/** * Write success file to Hadoop if required * * @param conf Configuration// w w w . j a va 2 s.co m * @throws IOException I/O errors */ private static void writeSuccessFile(Configuration conf) throws IOException { if (!HadoopUtils.needSuccessMarker(conf)) { return; } Path outputPath = HadoopUtils.getOutputPath(conf); FileSystem fs = outputPath.getFileSystem(conf); if (fs.exists(outputPath)) { Path successPath = new Path(outputPath, "_SUCCESS"); if (!fs.exists(successPath)) { LOG.info("Writing success file to {}", successPath); fs.create(successPath).close(); } } }