Example usage for org.apache.hadoop.fs FileSystem exists

Introduction

This page collects example usages of org.apache.hadoop.fs.FileSystem exists, drawn from open-source projects.

Prototype

public boolean exists(Path f) throws IOException 

Document

Check if a path exists.
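Before the project examples below, here is a minimal, self-contained sketch of the call (the path /tmp/example and the default Configuration are illustrative assumptions, not taken from any project below):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ExistsExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/example"); // illustrative path
        // Resolve the FileSystem that owns this path (HDFS, local, etc.)
        FileSystem fs = path.getFileSystem(conf);
        if (fs.exists(path)) {
            System.out.println(path + " exists");
        } else {
            System.out.println(path + " does not exist");
        }
    }
}

Note that exists only reports a point-in-time snapshot; as several of the examples below show, callers typically pair it with delete or mkdirs and must tolerate the path changing between the check and the follow-up call.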

Usage

From source file:cn.edu.hfut.dmic.webcollectorcluster.generator.Injector.java

public void inject(Path crawlDir, ArrayList<String> urls)
        throws IOException, InterruptedException, ClassNotFoundException, Exception {
    Path crawldb = new Path(crawlDir, "crawldb");
    Configuration config = CrawlerConfiguration.create();
    System.out.println(config.get("mapred.jar"));
    FileSystem fs = crawldb.getFileSystem(config);
    Path tempdb = new Path(crawldb, "temp");
    if (fs.exists(tempdb)) {
        fs.delete(tempdb, true);
    }

    SequenceFile.Writer writer = new SequenceFile.Writer(fs, config, new Path(tempdb, "info.avro"), Text.class,
            CrawlDatum.class);
    for (String url : urls) {
        CrawlDatum crawldatum = new CrawlDatum();
        crawldatum.setUrl(url);
        crawldatum.setStatus(CrawlDatum.STATUS_DB_INJECTED);
        writer.append(new Text(url), crawldatum);
        System.out.println("inject:" + url);
    }
    writer.close();

    String[] args = new String[] { crawldb.toString(), tempdb.toString() };

    ToolRunner.run(CrawlerConfiguration.create(), new Merge(), args);
    Merge.install(crawldb);

    if (fs.exists(tempdb)) {
        fs.delete(tempdb, true);
    }

}

From source file:cn.edu.hfut.dmic.webcollectorcluster.generator.Merge.java

public static Job createJob(Configuration conf, Path crawldb) throws IOException {

    Job job = new Job(conf);
    //job.setJarByClass(Merge.class);
    job.getConfiguration().set("mapred",
            "/home/hu/mygit/WebCollector2/WebCollectorCluster/target/WebCollectorCluster-2.0.jar");
    Path newdb = new Path(crawldb, "new");
    Path currentdb = new Path(crawldb, "current");

    FileSystem fs = crawldb.getFileSystem(CrawlerConfiguration.create());
    if (fs.exists(currentdb)) {
        FileInputFormat.addInputPath(job, currentdb);
    }

    if (fs.exists(newdb)) {
        fs.delete(newdb, true);
    }

    FileOutputFormat.setOutputPath(job, newdb);

    job.setInputFormatClass(SequenceFileInputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    job.setMapperClass(MergeMap.class);
    job.setReducerClass(MergeReduce.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(CrawlDatum.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    return job;
}

From source file:cn.edu.hfut.dmic.webcollectorcluster.generator.Merge.java

public static void install(Path crawldb) throws IOException {
    FileSystem fs = crawldb.getFileSystem(CrawlerConfiguration.create());
    Path newdb = new Path(crawldb, "new");
    Path currentdb = new Path(crawldb, "current");
    Path olddb = new Path(crawldb, "old");
    if (fs.exists(currentdb)) {
        if (fs.exists(olddb)) {
            fs.delete(olddb, true);
        }
        fs.rename(currentdb, olddb);
    }
    fs.mkdirs(crawldb);
    fs.rename(newdb, currentdb);
}

From source file:cn.jpush.hdfs.mr.example.BaileyBorweinPlouffe.java

License:Apache License

/** Run a map/reduce job to compute Pi. */
private static void compute(int startDigit, int nDigits, int nMaps, String workingDir, Configuration conf,
        PrintStream out) throws IOException {
    final String name = startDigit + "_" + nDigits;

    // set up the working directory
    out.println("Working Directory = " + workingDir);
    out.println();
    // resolve the FileSystem from a path under the working directory
    final FileSystem fs = new Path(workingDir, "part-r-00000").getFileSystem(conf);
    final Path dir = fs.makeQualified(new Path(workingDir));
    if (fs.exists(dir)) {
        throw new IOException("Working directory " + dir + " already exists.  Please remove it first.");
    } else if (!fs.mkdirs(dir)) {
        throw new IOException("Cannot create working directory " + dir);
    }

    out.println("Start Digit      = " + startDigit);
    out.println("Number of Digits = " + nDigits);
    out.println("Number of Maps   = " + nMaps);

    // setup a job
    final Job job = createJob(name, conf);
    final Path hexfile = new Path(dir, "pi_" + name + ".hex");
    FileOutputFormat.setOutputPath(job, new Path(dir, "out"));

    // setup custom properties
    job.getConfiguration().set(WORKING_DIR_PROPERTY, dir.toString());
    job.getConfiguration().set(HEX_FILE_PROPERTY, hexfile.toString());

    job.getConfiguration().setInt(DIGIT_START_PROPERTY, startDigit);
    job.getConfiguration().setInt(DIGIT_SIZE_PROPERTY, nDigits);
    job.getConfiguration().setInt(DIGIT_PARTS_PROPERTY, nMaps);

    // start a map/reduce job
    out.println("\nStarting Job ...");
    final long startTime = System.currentTimeMillis();
    try {
        if (!job.waitForCompletion(true)) {
            out.println("Job failed.");
            System.exit(1);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        out.println("Duration is " + duration + " seconds.");
    }
    out.println("Output file: " + hexfile);
}

From source file:cn.jpush.hdfs.mr.example.WordMedian.java

License:Apache License

/**
 * This is a standard program to read and find a median value based on a
 * file of word counts such as: 1 456, 2 132, 3 56... Where the first values
 * are the word lengths and the following values are the number of times
 * that words of that length appear.
 * 
 * @param path
 *            The path to read the HDFS file from
 *            (part-r-00000...00001...etc).
 * @param medianIndex1
 *            The first length value to look for.
 * @param medianIndex2
 *            The second length value to look for (will be the same as the
 *            first if there are an even number of words total).
 * @throws IOException
 *             If file cannot be found, we throw an exception.
 * */
private double readAndFindMedian(String path, int medianIndex1, int medianIndex2, Configuration conf)
        throws IOException {
    // resolve the FileSystem from the output file path
    Path file = new Path(path, "part-r-00000");
    FileSystem fs = file.getFileSystem(conf);

    if (!fs.exists(file))
        throw new IOException("Output not found!");

    BufferedReader br = null;

    try {
        br = new BufferedReader(new InputStreamReader(fs.open(file), Charsets.UTF_8));
        int num = 0;

        String line;
        while ((line = br.readLine()) != null) {
            StringTokenizer st = new StringTokenizer(line);

            // grab length
            String currLen = st.nextToken();

            // grab count
            String lengthFreq = st.nextToken();

            int prevNum = num;
            num += Integer.parseInt(lengthFreq);

            if (medianIndex2 >= prevNum && medianIndex1 <= num) {
                System.out.println("The median is: " + currLen);
                br.close();
                return Double.parseDouble(currLen);
            } else if (medianIndex2 >= prevNum && medianIndex1 < num) {
                String nextCurrLen = st.nextToken();
                double theMedian = (Integer.parseInt(currLen) + Integer.parseInt(nextCurrLen)) / 2.0;
                System.out.println("The median is: " + theMedian);
                br.close();
                return theMedian;
            }
        }
    } finally {
        if (br != null) {
            br.close();
        }
    }
    // error, no median found
    return -1;
}

From source file:co.cask.cdap.common.logging.SyncTest.java

License:Apache License

@Test
@Ignore
public void testSync() throws IOException {
    FileSystem fs = FileSystem.get(config);
    // create a file and write n bytes, then sync
    Path path = new Path("/myfile");
    FSDataOutputStream out = fs.create(path, false, 4096, (short) 2, 4096L);
    int numBytes = 5000;
    for (int i = 0; i < numBytes; i++) {
        out.write((byte) i);
    }
    out.hflush();
    // verify the file is there
    Assert.assertTrue(fs.exists(path));
    // do not verify the length of the file, hflush() does not update that
    //Assert.assertEquals(numBytes, fs.getFileStatus(path).getLen());
    // read back and verify all bytes
    FSDataInputStream in = fs.open(path);
    byte[] buffer = new byte[numBytes];
    in.readFully(buffer);
    for (int i = 0; i < numBytes; i++) {
        Assert.assertEquals((byte) i, buffer[i]);
    }
    in.close();
    // now close the writer
    out.close();
}

From source file:co.cask.cdap.data.hbase.HBase10CDH550Test.java

License:Apache License

@Override
public HRegion createHRegion(byte[] tableName, byte[] startKey, byte[] stopKey, String callingMethod,
        Configuration conf, byte[]... families) throws IOException {
    if (conf == null) {
        conf = new Configuration();
    }
    HTableDescriptor htd = new HTableDescriptor(tableName);
    for (byte[] family : families) {
        htd.addFamily(new HColumnDescriptor(family));
    }
    HRegionInfo info = new HRegionInfo(htd.getTableName(), startKey, stopKey, false);
    Path path = new Path(conf.get(HConstants.HBASE_DIR), callingMethod);
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(path)) {
        if (!fs.delete(path, true)) {
            throw new IOException("Failed delete of " + path);
        }
    }
    return HRegion.createHRegion(info, path, conf, htd);
}

From source file:co.cask.cdap.data.hbase.HBase94Test.java

License:Apache License

@Override
public HRegion createHRegion(byte[] tableName, byte[] startKey, byte[] stopKey, String callingMethod,
        Configuration conf, byte[]... families) throws IOException {
    if (conf == null) {
        conf = new Configuration();
    }
    HTableDescriptor htd = new HTableDescriptor(tableName);
    for (byte[] family : families) {
        htd.addFamily(new HColumnDescriptor(family));
    }
    HRegionInfo info = new HRegionInfo(htd.getName(), startKey, stopKey, false);
    Path path = new Path(conf.get(HConstants.HBASE_DIR), callingMethod);
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(path)) {
        if (!fs.delete(path, true)) {
            throw new IOException("Failed delete of " + path);
        }
    }
    return HRegion.createHRegion(info, path, conf, htd);
}

From source file:co.cask.cdap.data.tools.HBaseTableExporter.java

License:Apache License

/**
 * Sets up the actual MapReduce job.
 * @param tx The transaction which needs to be passed to the Scan instance. This transaction is used by
 *           coprocessors to filter out data corresponding to invalid transactions.
 * @param tableName Name of the table which needs to be exported as HFiles.
 * @return the configured job
 * @throws IOException
 */
public Job createSubmittableJob(Transaction tx, String tableName) throws IOException {

    Job job = Job.getInstance(hConf, "HBaseTableExporter");

    job.setJarByClass(HBaseTableExporter.class);
    Scan scan = new Scan();
    scan.setCacheBlocks(false);
    // Set the transaction attribute for the scan.
    scan.setAttribute(TxConstants.TX_OPERATION_ATTRIBUTE_KEY, new TransactionCodec().encode(tx));
    job.setNumReduceTasks(0);

    TableMapReduceUtil.initTableMapperJob(tableName, scan, KeyValueImporter.class, null, null, job);

    FileSystem fs = FileSystem.get(hConf);
    Random rand = new Random();
    Path root = new Path(fs.getWorkingDirectory(), "hbasetableexporter");
    fs.mkdirs(root);
    while (true) {
        bulkloadDir = new Path(root, "" + rand.nextLong());
        if (!fs.exists(bulkloadDir)) {
            break;
        }
    }

    HFileOutputFormat2.setOutputPath(job, bulkloadDir);
    HTable hTable = new HTable(hConf, tableName);
    HFileOutputFormat2.configureIncrementalLoad(job, hTable);

    return job;
}

From source file:co.cask.cdap.internal.app.runtime.batch.dataset.partitioned.DynamicPartitioningOutputCommitter.java

License:Apache License

@Override
public void commitJob(JobContext context) throws IOException {
    Configuration configuration = context.getConfiguration();
    MapReduceClassLoader classLoader = MapReduceClassLoader.getFromConfiguration(configuration);
    BasicMapReduceTaskContext taskContext = classLoader.getTaskContextProvider().get(this.taskContext);

    String outputDatasetName = configuration.get(Constants.Dataset.Partitioned.HCONF_ATTR_OUTPUT_DATASET);
    PartitionedFileSet outputDataset = taskContext.getDataset(outputDatasetName);
    Partitioning partitioning = outputDataset.getPartitioning();

    Set<PartitionKey> partitionsToAdd = new HashSet<>();
    Set<String> relativePaths = new HashSet<>();
    // Go over all files in the temporary directory and keep track of partitions to add for them
    FileStatus[] allCommittedTaskPaths = getAllCommittedTaskPaths(context);
    for (FileStatus committedTaskPath : allCommittedTaskPaths) {
        FileSystem fs = committedTaskPath.getPath().getFileSystem(configuration);
        RemoteIterator<LocatedFileStatus> fileIter = fs.listFiles(committedTaskPath.getPath(), true);
        while (fileIter.hasNext()) {
            Path path = fileIter.next().getPath();
            String relativePath = getRelative(committedTaskPath.getPath(), path);

            int lastPathSepIdx = relativePath.lastIndexOf(Path.SEPARATOR);
            if (lastPathSepIdx == -1) {
                // this shouldn't happen because each relative path should consist of at least one partition key and
                // the output file name
                LOG.warn("Skipping path '{}'. It's relative path '{}' has fewer than two parts", path,
                        relativePath);/*w  ww  . ja v  a  2s .  c  o m*/
                continue;
            }
            // relativePath = "../key1/key2/part-m-00000"
            // relativeDir = "../key1/key2"
            // fileName = "part-m-00000"
            String relativeDir = relativePath.substring(0, lastPathSepIdx);
            String fileName = relativePath.substring(lastPathSepIdx + 1);

            Path finalDir = new Path(FileOutputFormat.getOutputPath(context), relativeDir);
            Path finalPath = new Path(finalDir, fileName);
            if (fs.exists(finalPath)) {
                throw new FileAlreadyExistsException("Final output path " + finalPath + " already exists");
            }
            PartitionKey partitionKey = getPartitionKey(partitioning, relativeDir);
            partitionsToAdd.add(partitionKey);
            relativePaths.add(relativeDir);
        }
    }

    // We need to copy to the parent of the FileOutputFormat's outputDir, since we added a _temporary_jobId suffix to
    // the original outputDir.
    Path finalOutput = FileOutputFormat.getOutputPath(context);
    FileSystem fs = finalOutput.getFileSystem(configuration);
    for (FileStatus stat : getAllCommittedTaskPaths(context)) {
        mergePaths(fs, stat, finalOutput);
    }

    // compute the metadata to be written to every output partition
    Map<String, String> metadata = ConfigurationUtil.getNamedConfigurations(this.taskContext.getConfiguration(),
            PartitionedFileSetArguments.OUTPUT_PARTITION_METADATA_PREFIX);

    // create all the necessary partitions
    for (PartitionKey partitionKey : partitionsToAdd) {
        PartitionOutput partitionOutput = outputDataset.getPartitionOutput(partitionKey);
        partitionOutput.setMetadata(metadata);
        partitionOutput.addPartition();
    }

    // close the TaskContext, which flushes dataset operations
    try {
        taskContext.flushOperations();
    } catch (Exception e) {
        Throwables.propagateIfPossible(e, IOException.class);
        throw new IOException(e);
    }

    // delete the job-specific _temporary folder and create a _done file in the output folder
    cleanupJob(context);

    // mark all the final output paths with a _SUCCESS file, if configured to do so (default = true)
    if (configuration.getBoolean(SUCCESSFUL_JOB_OUTPUT_DIR_MARKER, true)) {
        for (String relativePath : relativePaths) {
            Path pathToMark = new Path(finalOutput, relativePath);
            Path markerPath = new Path(pathToMark, SUCCEEDED_FILE_NAME);
            fs.createNewFile(markerPath);
        }
    }
}