Example usage for org.apache.hadoop.fs FileSystem exists

List of usage examples for org.apache.hadoop.fs FileSystem exists

Introduction

This page lists usage examples for org.apache.hadoop.fs.FileSystem.exists.

Prototype

public boolean exists(Path f) throws IOException 

Document

Check if a path exists.
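A minimal, self-contained sketch of the call (the class name and command-line argument below are illustrative placeholders, not taken from the examples that follow):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ExistsExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]);
        // Resolve the FileSystem instance that owns this path (HDFS, local, etc.).
        FileSystem fs = path.getFileSystem(conf);
        if (fs.exists(path)) {
            System.out.println(path + " exists");
        } else {
            System.out.println(path + " does not exist");
        }
    }
}

Note that exists only reports presence at the moment of the call; code that must not overwrite an existing file typically also calls FileSystem#create(Path, boolean) with overwrite set to false, as several of the examples below do.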

Usage

From source file:com.marklogic.contentpump.OutputArchive.java

License:Apache License

private void newOutputStream() throws IOException {
    // use the constructor filename for the first zip,
    // then add filecount to subsequent archives, if any.
    int count = fileCount.getAndIncrement();
    currPath = newPackagePath(basePath, count, 6);
    if (outputStream != null) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("closing output archive: " + currPath);
        }
        outputStream.flush();
        outputStream.close();
    }
    currentFileBytes = 0;
    currentEntries = 0;

    Path zpath = new Path(currPath);
    FileSystem fs = zpath.getFileSystem(conf);
    if (fs.exists(zpath)) {
        throw new IOException(zpath + " already exists.");
    }

    if (LOG.isDebugEnabled()) {
        LOG.debug("Creating output archive: " + zpath);
        LOG.debug("Default charset: " + Charset.defaultCharset());
    }
    // if fs instanceof DistributedFileSystem, use hadoop api; otherwise,
    // use java api
    if (fs instanceof DistributedFileSystem) {
        FSDataOutputStream fsout = fs.create(zpath, false);
        outputStream = new ZipOutputStream(fsout);
    } else {
        File f = new File(zpath.toUri().getPath());
        if (!f.exists()) {
            f.getParentFile().mkdirs();
            f.createNewFile();
        }
        FileOutputStream fos = new FileOutputStream(f, false);
        outputStream = new ZipOutputStream(fos);
    }

}

From source file:com.marklogic.mapreduce.LargeBinaryDocument.java

License:Apache License

public byte[] getContentAsByteArray(int offset, int len) {
    FileSystem fs;
    FSDataInputStream is = null;
    try {
        fs = path.getFileSystem(conf);
        if (!fs.exists(path)) {
            throw new RuntimeException("File not found: " + path);
        }
        FileStatus status = fs.getFileStatus(path);
        if (status.getLen() < offset) {
            throw new RuntimeException("Reached end of file: " + path);
        }
        byte[] buf = new byte[len];
        is = fs.open(path);
        // skip to the requested offset; skipBytes may skip fewer bytes than asked
        for (int toSkip = offset, skipped = 0; toSkip > 0; toSkip -= skipped) {
            skipped = is.skipBytes(toSkip);
        }
        for (int bytesRead = 0; bytesRead < len;) {
            int read = is.read(buf, bytesRead, len - bytesRead);
            if (read < 0) {
                throw new IOException("Unexpected end of file: " + path);
            }
            bytesRead += read;
        }
        return buf;
    } catch (IOException e) {
        throw new RuntimeException("Error accessing file: " + path, e);
    } finally {
        if (is != null) {
            try {
                is.close();
            } catch (IOException e) {
                // ignore errors while closing the stream
            }
        }
    }
}

From source file:com.marklogic.mapreduce.LargeBinaryDocument.java

License:Apache License

@Override
public InputStream getContentAsByteStream() {
    FileSystem fs;
    FSDataInputStream is = null;
    try {
        fs = path.getFileSystem(conf);
        if (!fs.exists(path)) {
            throw new RuntimeException("File not found: " + path);
        }
        is = fs.open(path);
        return is;
    } catch (IOException e) {
        throw new RuntimeException("Error accessing file: " + path, e);
    }
}

From source file:com.metamx.druid.indexer.DeterminePartitionsJob.java

License:Open Source License

public boolean run() {
    try {
        /*
         * Group by (timestamp, dimensions) so we can correctly count dimension values as they would appear
         * in the final segment.
         */

        if (!config.getPartitionsSpec().isAssumeGrouped()) {
            final Job groupByJob = new Job(new Configuration(), String.format(
                    "%s-determine_partitions_groupby-%s", config.getDataSource(), config.getIntervals()));

            injectSystemProperties(groupByJob);
            groupByJob.setInputFormatClass(TextInputFormat.class);
            groupByJob.setMapperClass(DeterminePartitionsGroupByMapper.class);
            groupByJob.setMapOutputKeyClass(BytesWritable.class);
            groupByJob.setMapOutputValueClass(NullWritable.class);
            groupByJob.setCombinerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setReducerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setOutputKeyClass(BytesWritable.class);
            groupByJob.setOutputValueClass(NullWritable.class);
            groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
            groupByJob.setJarByClass(DeterminePartitionsJob.class);

            config.addInputPaths(groupByJob);
            config.intoConfiguration(groupByJob);
            FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());

            groupByJob.submit();
            log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(),
                    groupByJob.getTrackingURL());

            if (!groupByJob.waitForCompletion(true)) {
                log.error("Job failed: %s", groupByJob.getJobID());
                return false;
            }
        } else {
            log.info("Skipping group-by job.");
        }

        /*
         * Read grouped data and determine appropriate partitions.
         */
        final Job dimSelectionJob = new Job(new Configuration(), String.format(
                "%s-determine_partitions_dimselection-%s", config.getDataSource(), config.getIntervals()));

        dimSelectionJob.getConfiguration().set("io.sort.record.percent", "0.19");

        injectSystemProperties(dimSelectionJob);

        if (!config.getPartitionsSpec().isAssumeGrouped()) {
            // Read grouped data from the groupByJob.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionPostGroupByMapper.class);
            dimSelectionJob.setInputFormatClass(SequenceFileInputFormat.class);
            FileInputFormat.addInputPath(dimSelectionJob, config.makeGroupedDataDir());
        } else {
            // Directly read the source data, since we assume it's already grouped.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionAssumeGroupedMapper.class);
            dimSelectionJob.setInputFormatClass(TextInputFormat.class);
            config.addInputPaths(dimSelectionJob);
        }

        SortableBytes.useSortableBytesAsMapOutputKey(dimSelectionJob);
        dimSelectionJob.setMapOutputValueClass(Text.class);
        dimSelectionJob.setCombinerClass(DeterminePartitionsDimSelectionCombiner.class);
        dimSelectionJob.setReducerClass(DeterminePartitionsDimSelectionReducer.class);
        dimSelectionJob.setOutputKeyClass(BytesWritable.class);
        dimSelectionJob.setOutputValueClass(Text.class);
        dimSelectionJob.setOutputFormatClass(DeterminePartitionsDimSelectionOutputFormat.class);
        dimSelectionJob.setJarByClass(DeterminePartitionsJob.class);

        config.intoConfiguration(dimSelectionJob);
        FileOutputFormat.setOutputPath(dimSelectionJob, config.makeIntermediatePath());

        dimSelectionJob.submit();
        log.info("Job %s submitted, status available at: %s", dimSelectionJob.getJobName(),
                dimSelectionJob.getTrackingURL());

        if (!dimSelectionJob.waitForCompletion(true)) {
            log.error("Job failed: %s", dimSelectionJob.getJobID().toString());
            return false;
        }

        /*
         * Load partitions determined by the previous job.
         */

        log.info("Job completed, loading up partitions for intervals[%s].",
                config.getSegmentGranularIntervals());
        FileSystem fileSystem = null;
        Map<DateTime, List<HadoopyShardSpec>> shardSpecs = Maps.newTreeMap(DateTimeComparator.getInstance());
        int shardCount = 0;
        for (Interval segmentGranularity : config.getSegmentGranularIntervals()) {
            DateTime bucket = segmentGranularity.getStart();

            final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(new Bucket(0, bucket, 0));
            if (fileSystem == null) {
                fileSystem = partitionInfoPath.getFileSystem(dimSelectionJob.getConfiguration());
            }
            if (fileSystem.exists(partitionInfoPath)) {
                List<ShardSpec> specs = config.jsonMapper.readValue(
                        Utils.openInputStream(dimSelectionJob, partitionInfoPath),
                        new TypeReference<List<ShardSpec>>() {
                        });

                List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(specs.size());
                for (int i = 0; i < specs.size(); ++i) {
                    actualSpecs.add(new HadoopyShardSpec(specs.get(i), shardCount++));
                    log.info("DateTime[%s], partition[%d], spec[%s]", bucket, i, actualSpecs.get(i));
                }

                shardSpecs.put(bucket, actualSpecs);
            } else {
                log.info("Path[%s] didn't exist!?", partitionInfoPath);
            }
        }
        config.setShardSpecs(shardSpecs);

        return true;
    } catch (Exception e) {
        throw Throwables.propagate(e);
    }
}

From source file:com.metamx.druid.indexer.Utils.java

License:Open Source License

public static OutputStream makePathAndOutputStream(JobContext job, Path outputPath, boolean deleteExisting)
        throws IOException {
    OutputStream retVal;
    FileSystem fs = outputPath.getFileSystem(job.getConfiguration());

    if (fs.exists(outputPath)) {
        if (deleteExisting) {
            fs.delete(outputPath, false);
        } else {
            throw new ISE("outputPath[%s] must not exist.", outputPath);
        }
    }

    if (!FileOutputFormat.getCompressOutput(job)) {
        retVal = fs.create(outputPath, false);
    } else {
        Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(job,
                GzipCodec.class);
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, job.getConfiguration());
        outputPath = new Path(outputPath.toString() + codec.getDefaultExtension());

        retVal = codec.createOutputStream(fs.create(outputPath, false));
    }

    return retVal;
}

From source file:com.metamx.milano.pig.MilanoLoadFunc.java

License:Apache License

/**
 * This builds a Pig ResourceSchema from the input file(s). This relies on the existence of TypeMetadata.
 * This is the method by which we pass the schema types and names directly to pig without having to specify them directly.
 *
 * @param location As passed to relativeToAbsolutePath
 * @param job      The job.
 *
 * @return Returns a ResourceSchema representing the incoming file(s) or null if TypeMetadata does not exist.
 *
 * @throws IOException Not thrown directly, but thrown from getMessageSchema where it indicates an unsupported type.
 */
@Override
public ResourceSchema getSchema(String location, Job job) throws IOException {
    Configuration conf = job.getConfiguration();
    Properties props = ConfigurationUtil.toProperties(conf);

    // HACK: Here we open the file directly to read the TypeMetadata.
    // HACK: There may be a better more direct way to do this, but it works for now.
    Path path = new Path(location);
    FileSystem fileSystem = path.getFileSystem(conf);

    FileStatus fileStatus = fileSystem.getFileStatus(path);
    if (fileStatus.isDir()) {
        log.debug(String.format("Path is a directory."));
        path = getFilePath(path, fileSystem);
        if (path == null) {
            return null;
        }
    } else if (!fileSystem.exists(path)) {
        return null;
    }

    MilanoProtoFile.Reader reader = MilanoProtoFile.createReader(fileSystem.open(path));
    typeMetadata = reader.getMetadata();
    reader.close();

    if (typeMetadata == null) {
        return null;
    }
    descriptor = MilanoTool.with(typeMetadata).getDescriptor();

    return new ResourceSchema(getMessageSchema(descriptor));
}

From source file:com.michaeljones.hellohadoopworldmaven.HelloMapReduceTest.java

/**
 * Test of RunJobAsync method, of class HelloMapReduce.
 * @throws java.lang.Exception
 */
@Test
public void testRunJobAsync() throws Exception {
    System.out.println("RunJobAsync");
    LOGGER.info("RunJobAsync");

    FileSystem hdfs = FileSystem.get(hadoopConfig);
    Path outputPath = new Path(wcOutputPathDir);

    // We need to remove the output directory before running the map reduce job.
    if (hdfs.exists(outputPath)) {
        // remove the directory recursively.
        hdfs.delete(outputPath, true);
    }

    Path inputPath = new Path(wcInputPathDir);
    Job result = HelloMapReduce.RunJobAsync(inputPath, outputPath, hadoopConfig);
    boolean ok = result.waitForCompletion(true);
    assertTrue(ok);
}

From source file:com.michaeljones.hellohadoopworldmaven.HelloMapReduceTest.java

/**
 * Test of RunJobAnalysisAsync method, of class HelloMapReduce.
 * @throws java.lang.Exception
 */
@Test
public void testRunJobAnalysisAsync() throws Exception {
    System.out.println("RunJobAnalysisAsync");
    LOGGER.info("RunJobAnalysisAsync");
    FileSystem hdfs = FileSystem.get(hadoopConfig);
    Path outputPath = new Path(wcOutputAnalysisPathDir);
    if (hdfs.exists(outputPath)) {
        hdfs.delete(outputPath, true);
    }

    Path inputPath = new Path(wcInputPathDir);
    Job result = HelloMapReduce.RunJobAnalysisAsync(inputPath, outputPath, hadoopConfig);
    boolean ok = result.waitForCompletion(true);
    assertTrue(ok);
}

From source file:com.michaeljones.hellohadoopworldmaven.HelloMapReduceTest.java

/**
 * Test of main method, of class HelloMapReduce.
 * @throws java.lang.Exception
 */
@Test
public void testMain() throws Exception {
    System.out.println("main");
    LOGGER.info("testMain");
    FileSystem hdfs = FileSystem.get(hadoopConfig);

    Path outputPath = new Path(wcOutputMainPathDir);
    if (hdfs.exists(outputPath)) {
        hdfs.delete(outputPath, true);
    }

    String[] args = { wcInputPathDir, wcOutputMainPathDir };
    HelloMapReduce.main(args);

    // Assume it is true.
    assertTrue(true);
}

From source file:com.ml.ira.algos.LogisticModelParameters.java

License:Apache License

public static LogisticModelParameters loadFrom(Path path) throws IOException {
    FileSystem ofs = path.getFileSystem(new Configuration());
    if (!ofs.exists(path)) {
        throw new IOException(path.toString() + " does not exist.");
    }
    LogisticModelParameters result = new LogisticModelParameters();
    FSDataInputStream in = ofs.open(path);
    result.readFields(in);
    ofs.close();
    return result;
}