Example usage for org.apache.hadoop.fs FileSystem exists

List of usage examples for org.apache.hadoop.fs FileSystem exists

Introduction

On this page you can find example usages of org.apache.hadoop.fs FileSystem exists.

Prototype

public boolean exists(Path f) throws IOException 

Document

Check if a path exists.
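
A minimal, self-contained sketch of the call is shown below; the filesystem URI and path are illustrative placeholders, not values taken from the examples on this page.

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class FileSystemExistsExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Placeholder URI; point this at your own cluster, or use FileSystem.get(conf) for the default filesystem.
        FileSystem fs = FileSystem.get(URI.create("hdfs://localhost:9000"), conf);
        Path path = new Path("/user/student/example.txt");

        // exists() returns true if the path refers to an existing file or directory.
        if (fs.exists(path)) {
            System.out.println(path + " exists");
        } else {
            System.out.println(path + " does not exist");
        }
        fs.close();
    }
}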

Usage

From source file:com.elex.dmp.lda.CVB0Driver.java

License:Apache License

public static int run(Configuration conf, Path inputPath, Path topicModelOutputPath, int numTopics,
        int numTerms, double alpha, double eta, int maxIterations, int iterationBlockSize,
        double convergenceDelta, Path dictionaryPath, Path docTopicOutputPath, Path topicModelStateTempPath,
        long randomSeed, float testFraction, int numTrainThreads, int numUpdateThreads, int maxItersPerDoc,
        int numReduceTasks, boolean backfillPerplexity)
        throws ClassNotFoundException, IOException, InterruptedException {
    // verify arguments
    Preconditions.checkArgument(testFraction >= 0.0 && testFraction <= 1.0,
            "Expected 'testFraction' value in range [0, 1] but found value '%s'", testFraction);
    Preconditions.checkArgument(!backfillPerplexity || testFraction > 0.0,
            "Expected 'testFraction' value in range (0, 1] but found value '%s'", testFraction);

    String infoString = "Will run Collapsed Variational Bayes (0th-derivative approximation) "
            + "learning for LDA on {} (numTerms: {}), finding {}-topics, with document/topic prior {}, "
            + "topic/term prior {}.  Maximum iterations to run will be {}, unless the change in "
            + "perplexity is less than {}.  Topic model output (p(term|topic) for each topic) will be "
            + "stored {}.  Random initialization seed is {}, holding out {} of the data for perplexity "
            + "check\n";
    log.info(infoString, new Object[] { inputPath, numTerms, numTopics, alpha, eta, maxIterations,
            convergenceDelta, topicModelOutputPath, randomSeed, testFraction });
    infoString = dictionaryPath == null ? ""
            : "Dictionary to be used located " + dictionaryPath.toString() + '\n';
    infoString += docTopicOutputPath == null ? ""
            : "p(topic|docId) will be stored " + docTopicOutputPath.toString() + '\n';
    log.info(infoString);

    FileSystem fs = FileSystem.get(topicModelStateTempPath.toUri(), conf);
    int iterationNumber = getCurrentIterationNumber(conf, topicModelStateTempPath, maxIterations);
    log.info("Current iteration number: {}", iterationNumber);

    conf.set(NUM_TOPICS, String.valueOf(numTopics));
    conf.set(NUM_TERMS, String.valueOf(numTerms));
    conf.set(DOC_TOPIC_SMOOTHING, String.valueOf(alpha));
    conf.set(TERM_TOPIC_SMOOTHING, String.valueOf(eta));
    conf.set(RANDOM_SEED, String.valueOf(randomSeed));
    conf.set(NUM_TRAIN_THREADS, String.valueOf(numTrainThreads));
    conf.set(NUM_UPDATE_THREADS, String.valueOf(numUpdateThreads));
    conf.set(MAX_ITERATIONS_PER_DOC, String.valueOf(maxItersPerDoc));
    conf.set(MODEL_WEIGHT, "1"); // TODO
    conf.set(TEST_SET_FRACTION, String.valueOf(testFraction));

    List<Double> perplexities = Lists.newArrayList();
    for (int i = 1; i <= iterationNumber; i++) {
        // form path to model
        Path modelPath = modelPath(topicModelStateTempPath, i);

        // read perplexity
        double perplexity = readPerplexity(conf, topicModelStateTempPath, i);
        if (Double.isNaN(perplexity)) {
            if (!(backfillPerplexity && i % iterationBlockSize == 0)) {
                continue;
            }
            log.info("Backfilling perplexity at iteration {}", i);
            if (!fs.exists(modelPath)) {
                log.error("Model path '{}' does not exist; Skipping iteration {} perplexity calculation",
                        modelPath.toString(), i);
                continue;
            }
            perplexity = calculatePerplexity(conf, inputPath, modelPath, i);
        }

        // register and log perplexity
        perplexities.add(perplexity);
        log.info("Perplexity at iteration {} = {}", i, perplexity);
    }

    long startTime = System.currentTimeMillis();
    while (iterationNumber < maxIterations) {
        // test convergence
        if (convergenceDelta > 0.0) {
            double delta = rateOfChange(perplexities);
            if (delta < convergenceDelta) {
                log.info("Convergence achieved at iteration {} with perplexity {} and delta {}",
                        new Object[] { iterationNumber, perplexities.get(perplexities.size() - 1), delta });
                break;
            }
        }

        // update model
        iterationNumber++;
        log.info("About to run iteration {} of {}", iterationNumber, maxIterations);
        Path modelInputPath = modelPath(topicModelStateTempPath, iterationNumber - 1);
        Path modelOutputPath = modelPath(topicModelStateTempPath, iterationNumber);
        runIteration(conf, inputPath, modelInputPath, modelOutputPath, iterationNumber, maxIterations,
                numReduceTasks);

        // calculate perplexity
        if (testFraction > 0 && iterationNumber % iterationBlockSize == 0) {
            perplexities.add(calculatePerplexity(conf, inputPath, modelOutputPath, iterationNumber));
            log.info("Current perplexity = {}", perplexities.get(perplexities.size() - 1));
            log.info("(p_{} - p_{}) / p_0 = {}; target = {}", new Object[] { iterationNumber,
                    iterationNumber - iterationBlockSize, rateOfChange(perplexities), convergenceDelta });
        }
    }
    log.info("Completed {} iterations in {} seconds", iterationNumber,
            (System.currentTimeMillis() - startTime) / 1000);
    log.info("Perplexities: ({})", Joiner.on(", ").join(perplexities));

    // write final topic-term and doc-topic distributions
    Path finalIterationData = modelPath(topicModelStateTempPath, iterationNumber);
    Job topicModelOutputJob = topicModelOutputPath != null
            ? writeTopicModel(conf, finalIterationData, topicModelOutputPath)
            : null;
    Job docInferenceJob = docTopicOutputPath != null
            ? writeDocTopicInference(conf, inputPath, finalIterationData, docTopicOutputPath)
            : null;
    if (topicModelOutputJob != null && !topicModelOutputJob.waitForCompletion(true)) {
        return -1;
    }
    if (docInferenceJob != null && !docInferenceJob.waitForCompletion(true)) {
        return -1;
    }
    return 0;
}

From source file:com.elex.dmp.lda.CVB0Driver.java

License:Apache License

/**
 * @param conf the job configuration
 * @param topicModelStateTemp path to the directory holding per-iteration model state
 * @param iteration the iteration whose perplexity should be read
 * @return the total perplexity divided by the total model weight of the documents sampled
 *         during perplexity computation, or {@code Double.NaN} if no perplexity data
 *         exists for the given iteration.
 * @throws IOException
 */
public static double readPerplexity(Configuration conf, Path topicModelStateTemp, int iteration)
        throws IOException {
    Path perplexityPath = perplexityPath(topicModelStateTemp, iteration);
    FileSystem fs = FileSystem.get(perplexityPath.toUri(), conf);
    if (!fs.exists(perplexityPath)) {
        log.warn("Perplexity path {} does not exist, returning NaN", perplexityPath);
        return Double.NaN;
    }
    double perplexity = 0;
    double modelWeight = 0;
    long n = 0;
    for (Pair<DoubleWritable, DoubleWritable> pair : new SequenceFileDirIterable<DoubleWritable, DoubleWritable>(
            perplexityPath, PathType.LIST, PathFilters.partFilter(), null, true, conf)) {
        modelWeight += pair.getFirst().get();
        perplexity += pair.getSecond().get();
        n++;
    }
    log.info("Read {} entries with total perplexity {} and model weight {}",
            new Object[] { n, perplexity, modelWeight });
    return perplexity / modelWeight;
}

From source file:com.elex.dmp.lda.CVB0Driver.java

License:Apache License

private static Job writeDocTopicInference(Configuration conf, Path corpus, Path modelInput, Path output)
        throws IOException, ClassNotFoundException, InterruptedException {
    String jobName = String.format("Writing final document/topic inference from %s to %s", corpus, output);
    log.info("About to run: " + jobName);
    Job job = new Job(conf, jobName);
    job.setMapperClass(CVB0DocInferenceMapper.class);
    job.setNumReduceTasks(0);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    FileSystem fs = FileSystem.get(corpus.toUri(), conf);
    if (modelInput != null && fs.exists(modelInput)) {
        FileStatus[] statuses = fs.listStatus(modelInput, PathFilters.partFilter());
        URI[] modelUris = new URI[statuses.length];
        for (int i = 0; i < statuses.length; i++) {
            modelUris[i] = statuses[i].getPath().toUri();
        }
        DistributedCache.setCacheFiles(modelUris, conf);
    }
    setModelPaths(job, modelInput); // see MAHOUT-1147
    FileInputFormat.addInputPath(job, corpus);
    FileOutputFormat.setOutputPath(job, output);
    job.setJarByClass(CVB0Driver.class);
    job.submit();
    return job;
}

From source file:com.elex.dmp.lda.CVB0Driver.java

License:Apache License

private static int getCurrentIterationNumber(Configuration config, Path modelTempDir, int maxIterations)
        throws IOException {
    FileSystem fs = FileSystem.get(modelTempDir.toUri(), config);
    int iterationNumber = 1;
    Path iterationPath = modelPath(modelTempDir, iterationNumber);
    while (fs.exists(iterationPath) && iterationNumber <= maxIterations) {
        log.info("Found previous state: " + iterationPath);
        iterationNumber++;
        iterationPath = modelPath(modelTempDir, iterationNumber);
    }
    return iterationNumber - 1;
}

From source file:com.ema.hadoop.test_hdfs.TestWrite.java

public static void main(String[] args) throws IOException, URISyntaxException {

    Configuration configuration = new Configuration();
    FileSystem hdfs = FileSystem.get(new URI("hdfs://localhost:9000"), configuration);
    Path file = new Path("hdfs://localhost:9000/user/student/text_file_write.txt");
    if (hdfs.exists(file)) {
        hdfs.delete(file, true);
    }
    OutputStream os = hdfs.create(file, new Progressable() {
        @Override
        public void progress() {
            out.println("...bytes written");
        }
    });
    BufferedWriter br = new BufferedWriter(new OutputStreamWriter(os, "UTF-8"));
    br.write("This is just a test to check if it is possible to write a file on HDFS using the Java API");
    br.close();
    hdfs.close();

}

From source file:com.ema.hadoop.wordcount.WordCount_cache.java

public static void main(String[] args) throws Exception {

    if (args.length != 2) {
        System.err.println("Usage: WordCount <input path> <output path>");
        System.exit(-1);
    }

    // First we write the stop word list
    // it could also be a file manually loaded into HDFS

    String[] stopwords = { "the", "a" };
    Configuration configuration = new Configuration();
    FileSystem hdfs = FileSystem.get(new URI("hdfs://localhost:9000"), configuration);
    Path file = new Path("hdfs://localhost:9000/user/student/stop_words.txt");
    if (hdfs.exists(file)) {
        hdfs.delete(file, true);
    }
    OutputStream os = hdfs.create(file, new Progressable() {
        @Override
        public void progress() {
            out.println("...bytes written");
        }
    });
    BufferedWriter br = new BufferedWriter(new OutputStreamWriter(os, "UTF-8"));
    for (String w : stopwords) {
        br.write(w + "\n");
    }

    br.close();
    hdfs.close();

    Job job = Job.getInstance();
    job.addCacheFile(new Path("hdfs://localhost:9000/user/student/stop_words.txt").toUri());

    job.setJarByClass(WordCount_cache.class);
    job.setJobName("Word count job");

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(WCMapper_cache.class);
    job.setReducerClass(WCReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:com.endgame.binarypig.util.BuildSequenceFileFromArchive.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    File inDirOrFile = new File(args[0]);
    Path outputDir = new Path(args[1]);

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    if (!fs.exists(outputDir)) {
        fs.mkdirs(outputDir);
    }

    if (inDirOrFile.isFile()) {
        load(fs, conf, inDirOrFile, outputDir);
    } else {
        for (File file : inDirOrFile.listFiles()) {
            if (!file.isFile()) {
                System.out.println("Skipping " + file + " (not a file) ...");
                continue;
            }

            load(fs, conf, file, outputDir);
        }
    }

    return 0;
}

From source file:com.ery.dimport.daemon.TaskManager.java

License:Apache License

public void runTask(final TaskInfo task) {
    List<LogHostRunInfoPO> allFiles = new ArrayList<LogHostRunInfoPO>();
    try {
        task.START_TIME = new Date(System.currentTimeMillis());
        boolean needUpdate = false;
        TaskInfo exists = allTask.get(task.TASK_ID);
        if (exists == null) {
            needUpdate = true;
        } else {
            task.hosts = exists.hosts;
        }
        if (task.hosts == null || task.hosts.size() == 0) {
            task.hosts = new ArrayList<String>(master.getServerManager().getOnlineServers().keySet());
            needUpdate = true;
        }
        if (ZKUtil.checkExists(watcher, watcher.dimportRunTaskNode + "/" + task.TASK_ID) == -1) {
            needUpdate = true;
        }
        if (needUpdate) {
            try {
                task.HOST_SIZE = task.hosts.size();
                master.logWriter.writeLog(task);
                ZKUtil.createSetData(watcher, watcher.dimportRunTaskNode + "/" + task.TASK_ID,
                        DImportConstant.Serialize(task));
            } catch (Throwable e) {
            }
        }
        Thread thread = Thread.currentThread();
        ProcessInfo procInfo = null;
        synchronized (taskInProgress) {
            procInfo = taskInProgress.get(task.getRunTaskId());
        }
        procInfo.thread = thread;
        procInfo.startTime = System.currentTimeMillis();
        String filePath = task.FILE_PATH;
        boolean isInHdfs = false;
        final Map<String, Long> files = new HashMap<String, Long>();
        String tmpPath = conf.get(DImportConstant.DIMPORT_PROCESS_TMPDATA_DIR, System.getProperty("user.home"));
        if (tmpPath.endsWith("/")) {
            tmpPath = tmpPath.substring(0, tmpPath.length() - 1);
        }
        if (filePath == null || filePath.equals("")) {
            files.put("", 0l);
        } else {
            if (task.fileNamePattern != null || (task.FILE_FILTER != null && !task.FILE_FILTER.equals(""))) {
                task.FILE_FILTER = DImportConstant.macroProcess(task.FILE_FILTER);
                task.FILE_FILTER = task.FILE_FILTER.replaceAll("\\{host\\}", this.master.hostName);
                task.fileNamePattern = Pattern.compile(task.FILE_FILTER);
            }
            Matcher m = hdfsUrlPattern.matcher(filePath);
            if (m.matches()) {
                isInHdfs = true;
                filePath = m.group(2);
                // for (String string : conf.getValByRegex(".*").keySet()) {
                // System.out.println(string + "=" + conf.get(string));
                // }
                Path dirPath = new Path(filePath);
                FileSystem fs = FileSystem.get(HadoopConf.getConf(conf));
                if (!fs.exists(dirPath) || !fs.isDirectory(dirPath)) {
                    throw new IOException("HDFS? " + filePath + "?,?");
                }
                FileStatus[] hFiles = fs.listStatus(dirPath, new PathFilter() {
                    @Override
                    public boolean accept(Path name) {
                        if (task.fileNamePattern != null) {
                            System.out.println("hdfs listStatus:" + name.getParent() + "/" + name.getName());
                            return task.fileNamePattern.matcher(name.getName()).matches();
                        } else {
                            return true;
                        }
                    }
                });
                for (int i = 0; i < hFiles.length; i++) {
                    files.put(hFiles[i].getPath().toString(), hFiles[i].getLen());
                }
            } else {
                java.io.File f = new File(filePath);
                if (!f.exists() || !f.isDirectory()) {
                    throw new IOException(
                            "Local path " + filePath + " does not exist or is not a directory");
                }
                File[] lFiles = f.listFiles(new FilenameFilter() {
                    public boolean accept(File dir, String name) {
                        if (task.fileNamePattern != null) {
                            System.out.println("local fs listStatus:" + dir + "/" + name);
                            return task.fileNamePattern.matcher(name).matches();
                        } else {
                            return true;
                        }
                    }
                });
                for (int i = 0; i < lFiles.length; i++) {
                    files.put(lFiles[i].getAbsolutePath(), lFiles[i].length());
                }
            }
        }
        for (String fileName : files.keySet()) {
            LogHostRunInfoPO runInfo = new LogHostRunInfoPO(task);
            runInfo.RUN_LOG_ID = DImportConstant.shdf.format(task.SUBMIT_TIME) + "_" + allFiles.size() + "_"
                    + fileName.hashCode();
            runInfo.FILE_NAME = fileName;
            runInfo.RETURN_CODE = 255;
            runInfo.IS_RUN_SUCCESS = -1;
            runInfo.FILE_SIZE = files.get(fileName);
            runInfo.HOST_NAME = master.hostName;
            String localFile = fileName;
            if (isInHdfs) { // the file will be staged under the local temp path
                localFile = tmpPath + "/" + fileName.substring(fileName.lastIndexOf("/") + 1);
            }
            // build the command line for this file via macro substitution
            String[] cmds = procInfo.task.getCommand();
            for (int j = 0; j < cmds.length; j++) {
                cmds[j] = DImportConstant.macroProcess(cmds[j]);
                cmds[j] = cmds[j].replaceAll("\\{file\\}", localFile);
                cmds[j] = cmds[j].replaceAll("\\{host\\}", master.hostName);
            }
            runInfo.RUN_COMMAND = StringUtils.join(" ", cmds);
            master.logWriter.writeLog(runInfo);
            LOG.info("??" + runInfo);
            allFiles.add(runInfo);
        }
        ZKUtil.createSetData(watcher, watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/" + master.hostName,
                DImportConstant.Serialize(allFiles));
        for (LogHostRunInfoPO runInfo : allFiles) {
            if (procInfo.stoped)
                break;
            String fileName = runInfo.FILE_NAME;
            LOG.info("?:" + fileName);
            procInfo.RUN_LOG_ID = runInfo.RUN_LOG_ID;
            runInfo.START_TIME = new Date(System.currentTimeMillis());
            procInfo.processFile = fileName;
            String localFile = fileName;
            try {
                if (isInHdfs) {
                    localFile = tmpPath + "/" + fileName.substring(fileName.lastIndexOf("/") + 1);
                }
                procInfo.task.TASK_COMMAND = runInfo.RUN_COMMAND;
                if (isInHdfs) { // download the HDFS file to a local temp copy before running the command
                    File lf = new File(localFile);
                    if (lf.exists())
                        lf.delete();
                    FileSystem fs = FileSystem.get(HadoopConf.getConf(conf));
                    LOG.info("HDFS:" + fileName + "===>" + localFile);
                    long btime = System.currentTimeMillis();
                    fs.copyToLocalFile(new Path(fileName), new Path(localFile));
                    LOG.info("HDFS?:" + fileName + "===>" + localFile);
                    runInfo.downTime = System.currentTimeMillis() - btime;
                    fileName = localFile;
                }
                updateHostInfoLog(runInfo, allFiles);
                LOG.info(procInfo.task.TASK_NAME + " commandline: " + procInfo.task.TASK_COMMAND);
                procInfo.proc = execResult(runInfo.RUN_COMMAND);
                runInfo.IS_RUN_SUCCESS = 1;
                runInfo.RETURN_CODE = writeProcessLog(procInfo);
                LOG.info(procInfo.task.TASK_NAME + " return value: " + runInfo.RETURN_CODE);
                // runInfo.RETURN_CODE = procInfo.proc.exitValue();
            } catch (Throwable e) {
                runInfo.ERROR_MSG = e.getMessage();
                if (procInfo.proc != null) {
                    try {
                        procInfo.proc.destroy();
                    } catch (Exception ex) {
                    }
                }
                procInfo.proc = null;
                LOG.error("", e);
            } finally { // record the result and clean up any local temp copy
                runInfo.END_TIME = new Date(System.currentTimeMillis());
                master.logWriter.updateLog(runInfo);
                updateHostInfoLog(runInfo, allFiles);
                ZKUtil.createSetData(watcher,
                        watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/" + master.hostName,
                        DImportConstant.Serialize(allFiles));
                if (isInHdfs) {
                    File lf = new File(localFile);
                    if (lf.exists())
                        lf.delete();
                }
            }
        }
    } catch (Throwable e) {
        LOG.error("" + task, e);
        try {
            if (allFiles.size() > 0) {
                for (LogHostRunInfoPO logHostRunInfoPO : allFiles) {
                    if (logHostRunInfoPO.END_TIME.getTime() < 10000) {
                        logHostRunInfoPO.END_TIME = new Date(System.currentTimeMillis());
                        logHostRunInfoPO.IS_RUN_SUCCESS = 1;
                        logHostRunInfoPO.RETURN_CODE = 2;
                    }
                }
                ZKUtil.createSetData(watcher,
                        watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/" + master.hostName,
                        DImportConstant.Serialize(allFiles));
            }
        } catch (KeeperException e1) {
            LOG.error("update task run info on host :" + watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/"
                    + master.hostName, e);
        } catch (IOException e1) {
            LOG.error("update task run info on host " + watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/"
                    + master.hostName, e);
        }
    } finally { // always remove this task from the in-progress map
        synchronized (taskInProgress) {
            taskInProgress.remove(task.getRunTaskId());
        }
    }
}

From source file:com.facebook.hiveio.common.FileSystems.java

License:Apache License

/**
 * Move a file or directory from source to destination, recursively copying
 * subdirectories.
 *
 * @param fs FileSystem
 * @param file path to copy (file or directory)
 * @param src path to source directory
 * @param dest path to destination directory
 * @throws IOException I/O problems
 */
public static void move(FileSystem fs, Path file, Path src, Path dest) throws IOException {
    Path destFilePath = pathInDestination(file, src, dest);
    if (fs.isFile(file)) {
        if (fs.exists(destFilePath)) {
            if (!fs.delete(destFilePath, true)) {
                throw new IllegalArgumentException("Could not remove existing file " + destFilePath);
            }
        }
        if (!fs.rename(file, destFilePath)) {
            throw new IllegalArgumentException("Could not move " + file + " to " + destFilePath);
        }
    } else if (fs.getFileStatus(file).isDir()) {
        FileStatus[] statuses = fs.listStatus(file);
        fs.mkdirs(destFilePath);
        if (statuses != null) {
            for (FileStatus status : statuses) {
                move(fs, status.getPath(), src, dest);
            }
        }
    }
}

From source file:com.facebook.hiveio.output.HiveApiOutputCommitter.java

License:Apache License

/**
 * Write success file to Hadoop if required
 *
 * @param conf Configuration
 * @throws IOException I/O errors
 */
private static void writeSuccessFile(Configuration conf) throws IOException {
    if (!HadoopUtils.needSuccessMarker(conf)) {
        return;
    }
    Path outputPath = HadoopUtils.getOutputPath(conf);
    FileSystem fs = outputPath.getFileSystem(conf);
    if (fs.exists(outputPath)) {
        Path successPath = new Path(outputPath, "_SUCCESS");
        if (!fs.exists(successPath)) {
            LOG.info("Writing success file to {}", successPath);
            fs.create(successPath).close();
        }
    }
}