Example usage for org.apache.hadoop.fs FileSystem globStatus

List of usage examples for org.apache.hadoop.fs FileSystem globStatus

Introduction

This page shows example usages of org.apache.hadoop.fs.FileSystem.globStatus.

Prototype

public FileStatus[] globStatus(Path pathPattern) throws IOException 

Document

Returns all files that match filePattern and are not checksum files. Results are sorted by their names. If the pattern contains no glob characters and the path does not exist, null is returned instead of an empty array.
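
A minimal sketch of the common call pattern seen in the examples below (the path pattern here is hypothetical): guard against a null result before iterating, since globStatus may return null when nothing on the path exists.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public static void printMatches(Configuration conf) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    // hypothetical pattern; globStatus may return null rather than an
    // empty array when the path (or its parent) does not exist
    FileStatus[] matches = fs.globStatus(new Path("/data/logs/2020-*/part-*"));
    if (matches != null) {
        for (FileStatus status : matches) {
            System.out.println(status.getPath());
        }
    }
}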

Usage

From source file:com.streamsets.pipeline.stage.destination.hdfs.writer.RecordWriterManager.java

License:Apache License

public void commitOldFiles(FileSystem fs) throws IOException, ELEvalException {
    // if getLastBatchTime() is zero it means we never ran, so there is nothing to commit
    if (context.getLastBatchTime() > 0) {
        for (String glob : getGlobs()) {
            LOG.debug("Looking for uncommitted files using glob '{}'", glob);
            FileStatus[] globStatus = fs.globStatus(new Path(glob));
            if (globStatus != null) {
                for (FileStatus status : globStatus) {
                    LOG.debug("Found uncommitted file '{}'", status.getPath());
                    renameToFinalName(fs, status.getPath());
                }
            }
        }
    }
}

From source file:com.streamsets.pipeline.stage.destination.hdfs.writer.TestRecordWriterManager.java

License:Apache License

@Test
public void testGetGlobsAndCommitOldFiles() throws Exception {
    Calendar calendar = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
    calendar.add(Calendar.HOUR, -2);
    Date lastBatch = calendar.getTime();
    ContextInfoCreator.setLastBatch(targetContext, lastBatch.getTime());

    calendar.add(Calendar.HOUR, -1);
    Date beforeLastBatchWithinCutOff = calendar.getTime();

    calendar.add(Calendar.DATE, -1);
    Date beforeLastBatchOutsideCutOff = calendar.getTime();

    calendar.add(Calendar.DATE, 2);
    Date future = calendar.getTime();

    File testDir = new File("target", UUID.randomUUID().toString()).getAbsoluteFile();
    Assert.assertTrue(testDir.mkdirs());

    // using 1 hour cutoff
    RecordWriterManager mgr = getRecordWriterManager(
            testDir.getAbsolutePath() + "/${YY()}_${MM()}_${DD()}_${hh()}/${record:value('/')}", 3600);

    // this one should not show up when globbing
    String f1 = createTempFile(mgr, beforeLastBatchOutsideCutOff, "a");

    // all of these should show up when globbing
    String f2 = createTempFile(mgr, beforeLastBatchWithinCutOff, "b");
    String f3 = createTempFile(mgr, beforeLastBatchWithinCutOff, "c");
    String f4 = createTempFile(mgr, lastBatch, "d");

    // this one should not show up when globbing
    String f5 = createTempFile(mgr, future, "e");

    Set<String> expected = ImmutableSet.of(f2, f3, f4);

    Set<String> got = new HashSet<>();
    URI uri = new URI("file:///");
    Configuration conf = new HdfsConfiguration();
    FileSystem fs = FileSystem.get(uri, conf);

    // verifying the globs returned by getGlobs() are within the search boundaries
    List<String> globs = mgr.getGlobs();
    for (String glob : globs) {
        FileStatus[] status = fs.globStatus(new Path("file://" + glob));
        for (FileStatus s : status) {
            got.add(s.getPath().toString().substring("file:".length()));
        }
    }
    Assert.assertEquals(expected, got);

    // committing all temps within search boundaries
    mgr.commitOldFiles(fs);

    // verifying there are no temp files within the search boundaries after committing
    for (String glob : globs) {
        FileStatus[] status = fs.globStatus(new Path("file://" + glob));
        if (status != null) {
            for (FileStatus s : status) {
                Assert.fail("Uncommitted temp file left behind: " + s.getPath());
            }
        }
    }

    // verifying temps outside boundaries are still there
    Assert.assertTrue(new File(f1).exists());
    Assert.assertTrue(new File(f5).exists());

}

From source file:com.taobao.datax.plugins.common.DFSUtils.java

License:Open Source License

/**
 * List the statuses of the files/directories in the given path if the path
 * is a directory.
 * 
 * @param dfs
 *            handle of {@link FileSystem}
 * 
 * @param srcpath
 *            Path in {@link FileSystem}
 * 
 * @param isGlob
 *            whether to treat srcpath as a file pattern
 * 
 * @return all {@link Path} in srcpath
 * 
 * @throws IOException 
 * 
 * */
public static List<Path> listDir(FileSystem dfs, Path srcpath, boolean isGlob) throws IOException {
    List<Path> list = new ArrayList<Path>();
    FileStatus[] status = null;
    if (isGlob) {
        status = dfs.globStatus(srcpath);
    } else {
        status = dfs.listStatus(srcpath);
    }
    if (status != null) {
        for (FileStatus state : status) {
            list.add(state.getPath());
        }
    }

    return list;
}
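
A hedged usage sketch of listDir (the paths and the dfs handle are hypothetical): with isGlob true, srcpath is expanded as a pattern via globStatus; with isGlob false, the directory's children are listed via listStatus.

// hypothetical paths, assuming an existing FileSystem handle named dfs
List<Path> parts = DFSUtils.listDir(dfs, new Path("/data/input/part-*"), true); // glob expansion
List<Path> children = DFSUtils.listDir(dfs, new Path("/data/input"), false); // direct listing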

From source file:com.tfm.utad.reducerdata.ReducerDataPig.java

public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd-HH-mm-ss"); // "yyyy" (calendar year), not "YYYY" (week year)
    Date date = new Date();

    Path inputPath = new Path("/home/jab/camus/reducer-data-pig");
    Path outputDir = new Path("/home/jab/camus/pigdata/" + sdf.format(date));

    // Create configuration
    Configuration conf = new Configuration(true);
    conf.set(FS_DEFAULT_FS, HDFS_LOCALHOST_LOCALDOMAIN);
    FileSystem fs = FileSystem.get(conf);
    Path filesPath = new Path(inputPath + "/*");
    FileStatus[] files = fs.globStatus(filesPath);

    // Create job
    Job job = new Job(conf, "ReducerDataPig");
    job.setJarByClass(ReducerDataPig.class);

    // Setup MapReduce
    job.setMapperClass(ReducerDataPigMapper.class);
    job.setReducerClass(ReducerDataPigReducer.class);
    job.setNumReduceTasks(1);

    // Specify key / value
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(ReducerPigKey.class);

    // Input
    FileInputFormat.addInputPath(job, inputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);

    // Output
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Delete output if exists
    if (fs.exists(outputDir)) {
        fs.delete(outputDir, true);
    }

    // Execute job
    int code = job.waitForCompletion(true) ? 0 : 1;
    if (code == 0) {
        Counters counters = job.getCounters();
        Counter malformedCounter = counters.findCounter(ReducerDataEnum.MALFORMED_DATA);
        LOG.info("Counter malformed data: " + malformedCounter.getValue());
        for (FileStatus fStatus : files) {
            LOG.info("File name:" + fStatus.getPath());
            if (fStatus.isFile()) {
                LOG.info("Removing file in path:" + fStatus.getPath());
                fs.delete(fStatus.getPath(), false);
            }
        }
    }
}

From source file:com.tfm.utad.reducerdata.ReducerDataVertica.java

public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd-HH-mm-ss"); // "yyyy" (calendar year), not "YYYY" (week year)
    Date date = new Date();

    Path inputPath = new Path("/home/jab/camus/reducer-data-vertica");
    Path outputDir = new Path("/home/jab/camus/verticadb/" + sdf.format(date));

    // Create configuration
    Configuration conf = new Configuration(true);
    conf.set(FS_DEFAULT_FS, HDFS_LOCALHOST_LOCALDOMAIN);
    FileSystem fs = FileSystem.get(conf);
    Path filesPath = new Path(inputPath + "/*");
    FileStatus[] files = fs.globStatus(filesPath);

    // Create job
    Job job = new Job(conf, "ReducerDataVertica");
    job.setJarByClass(ReducerDataVertica.class);

    // Setup MapReduce
    job.setMapperClass(ReducerDataVerticaMapper.class);
    job.setReducerClass(ReducerDataVerticaReducer.class);
    job.setNumReduceTasks(1);

    // Specify key / value
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(ReducerVerticaValue.class);

    // Input
    FileInputFormat.addInputPath(job, inputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);

    // Output
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Delete output if exists
    if (fs.exists(outputDir)) {
        fs.delete(outputDir, true);
    }

    // Execute job
    int code = job.waitForCompletion(true) ? 0 : 1;
    if (code == 0) {
        Counters counters = job.getCounters();
        Counter malformedCounter = counters.findCounter(ReducerDataEnum.MALFORMED_DATA);
        LOG.info("Counter malformed data: " + malformedCounter.getValue());
        for (FileStatus fStatus : files) {
            LOG.info("File name:" + fStatus.getPath());
            if (fStatus.isFile()) {
                LOG.info("Removing file in path:" + fStatus.getPath());
                fs.delete(fStatus.getPath(), false);
            }
        }
    }
}

From source file:com.tfm.utad.sqoopdata.SqoopVerticaDB.java

public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(true);
    conf.set(FS_DEFAULT_FS, HDFS_LOCALHOST_LOCALDOMAIN);
    FileSystem fs = FileSystem.get(conf);
    Path filesPath = new Path(config.get(INPUT_DIRECTORY) + "/*/part-r*");
    FileStatus[] files = fs.globStatus(filesPath);
    for (FileStatus fStatus : files) {
        LOG.info("Path name:" + fStatus.getPath());
        Long minID = getID();
        int output = sqoopHDFStoVerticaDB(fStatus.getPath(), conf);
        if (output == 0) {
            LOG.info("Removing directory in path:" + fStatus.getPath().getParent());
            fs.delete(fStatus.getPath().getParent(), true);
            Long maxID = getID();
            sendDataToCartoDB(minID, maxID);
        } else {
            LOG.error("Sqoop FAILED exec file:" + fStatus.getPath()
                    + ".Please, contact the system administrator.");
        }
    }
}

From source file:com.turn.camino.Camino.java

License:Open Source License

/**
 * Materialize path
 *
 * Converts a path or path pattern into zero or more actual paths
 *
 * @param value rendered value of path
 * @param fileSystem file system
 * @return path status
 * @throws IOException
 */
protected List<PathDetail> materializePath(String value, FileSystem fileSystem) throws IOException {

    // using value to find path
    FileStatus[] fss = fileSystem.globStatus(new org.apache.hadoop.fs.Path(value));

    // path doesn't exist
    if (fss == null || fss.length == 0) {
        return Collections.emptyList();
    }

    // found match(es)
    List<PathDetail> pathDetails = Lists.newArrayListWithExpectedSize(fss.length);
    for (FileStatus fs : fss) {
        PathDetail pathDetail = new PathDetail(fs.getPath().toString(), fs.isDirectory(), fs.getLen(),
                fs.getModificationTime());
        pathDetails.add(pathDetail);
    }

    // return path details
    return pathDetails;
}

From source file:com.turn.camino.CaminoTest.java

License:Open Source License

/**
 * Test materializing a path
 *
 * @throws IOException
 */
@Test
public void testMaterializePath() throws IOException {

    long blockSize = 256L * 1024 * 1024;
    long now = System.currentTimeMillis();
    FileSystem fileSystem = mock(FileSystem.class);

    // path that results in one single file
    String pathValue1 = "/foo/bar";
    org.apache.hadoop.fs.Path path1 = new org.apache.hadoop.fs.Path(pathValue1);
    FileStatus[] fss1 = new FileStatus[] { new FileStatus(15000, false, 3, blockSize, now - 10000, path1) };
    when(fileSystem.globStatus(path1)).thenReturn(fss1);
    List<PathDetail> pathDetails1 = camino.materializePath(pathValue1, fileSystem);
    assertNotNull(pathDetails1);
    assertEquals(pathDetails1.size(), 1);
    assertEquals(pathDetails1.get(0).getLastModifiedTime(), fss1[0].getModificationTime());
    assertEquals(pathDetails1.get(0).getLength(), fss1[0].getLen());
    assertEquals(pathDetails1.get(0).isDirectory(), fss1[0].isDirectory());
    assertEquals(pathDetails1.get(0).getPathValue(), pathValue1);

    // path that results in no file
    String pathValue2 = "/foo/baz";
    org.apache.hadoop.fs.Path path2 = new org.apache.hadoop.fs.Path(pathValue2);
    when(fileSystem.globStatus(path2)).thenReturn(new FileStatus[] {});
    List<PathDetail> pathDetails2 = camino.materializePath(pathValue2, fileSystem);
    assertNotNull(pathDetails2);
    assertEquals(pathDetails2.size(), 0);

    // path whose parent doesn't exist (so globStatus returns null)
    String pathValue3 = "/goo/bao";
    org.apache.hadoop.fs.Path path3 = new org.apache.hadoop.fs.Path(pathValue3);
    when(fileSystem.globStatus(path3)).thenReturn(null);
    List<PathDetail> pathDetails3 = camino.materializePath(pathValue3, fileSystem);
    assertNotNull(pathDetails3);
    assertEquals(pathDetails3.size(), 0);

    // path that returns multiple files
    String pathValue4 = "/foo/bub_*";
    org.apache.hadoop.fs.Path path4 = new org.apache.hadoop.fs.Path(pathValue4);
    FileStatus[] fss4 = new FileStatus[] {
            new FileStatus(15000, false, 3, blockSize, now - 10000,
                    new org.apache.hadoop.fs.Path("/foo/bub_1")),
            new FileStatus(24000, false, 3, blockSize, now - 15000,
                    new org.apache.hadoop.fs.Path("/foo/bub_2")) };
    when(fileSystem.globStatus(path4)).thenReturn(fss4);
    List<PathDetail> pathDetails4 = camino.materializePath(pathValue4, fileSystem);
    assertNotNull(pathDetails4);
    assertEquals(pathDetails4.size(), 2);
    assertEquals(pathDetails4.get(0).getLastModifiedTime(), fss4[0].getModificationTime());
    assertEquals(pathDetails4.get(0).getLength(), fss4[0].getLen());
    assertEquals(pathDetails4.get(0).isDirectory(), fss4[0].isDirectory());
    assertEquals(pathDetails4.get(0).getPathValue(), "/foo/bub_1");
    assertEquals(pathDetails4.get(1).getLastModifiedTime(), fss4[1].getModificationTime());
    assertEquals(pathDetails4.get(1).getLength(), fss4[1].getLen());
    assertEquals(pathDetails4.get(1).isDirectory(), fss4[1].isDirectory());
    assertEquals(pathDetails4.get(1).getPathValue(), "/foo/bub_2");
}

From source file:com.turn.camino.CaminoTest.java

License:Open Source License

/**
 * Test processPathMetrics
 *
 * @throws InvalidNameException
 * @throws WrongTypeException
 * @throws RenderException
 * @throws IOException
 */
@Test
public void testProcessPathMetrics() throws InvalidNameException, WrongTypeException, RenderException,
        IOException, ExecutionException, InterruptedException {

    // create test environment
    long now = System.currentTimeMillis();
    double metricValue = 123456;
    Env env = mock(Env.class);
    Context context = mockGlobalContext(env);
    mockChildContext(context, context, env);

    // mock renderer
    Renderer renderer = mock(Renderer.class);
    mockMetricFunction(renderer, "age", metricValue);
    mockMetricFunction(renderer, "size", metricValue);
    mockMetricFunction(renderer, "count", metricValue);
    when(renderer.render(eq("big_data"), any(Context.class))).thenReturn("big_data");
    when(renderer.render(eq("/app/big_data"), any(Context.class))).thenReturn("/app/big_data");
    List<Path> paths = of(new Path("big_data", "/app/big_data"));

    // mock file system
    FileSystem fileSystem = mockFileSystem(env);
    org.apache.hadoop.fs.Path hadoopPath = new org.apache.hadoop.fs.Path("/app/big_data");
    when(fileSystem.globStatus(hadoopPath)).thenReturn(
            new FileStatus[] { new FileStatus(15000, false, 3, 256 * 1024 * 1024, now - 10000, hadoopPath) });

    // process path metrics
    List<Future<PathMetrics>> futures = Lists.newLinkedList();
    camino.processPathMetrics(paths, renderer, context, executorService, futures);

    // check that path status was resolved correctly
    PathMetrics pathMetrics = futures.get(0).get();
    PathStatus pathStatus = pathMetrics.getPathStatus();
    assertEquals(pathStatus.getName(), "big_data");
    assertEquals(pathStatus.getValue(), "/app/big_data");
    assertEquals(pathStatus.getPathDetails().size(), 1);
    assertEquals(pathStatus.getPathDetails().get(0).getPathValue(), "/app/big_data");
    assertFalse(pathStatus.getPathDetails().get(0).isDirectory());
    assertEquals(pathStatus.getPathDetails().get(0).getLength(), 15000);
    assertEquals(pathStatus.getPathDetails().get(0).getLastModifiedTime(), now - 10000);

    // check that metric data is expected
    List<MetricDatum> metricData = pathMetrics.getMetricData();
    assertEquals(metricData.size(), 3);
    assertEquals(metricData.get(0).getMetricId().getFullName(), "big_data.age");
    assertEquals(metricData.get(0).getValue(), metricValue, EPSILON);
    assertEquals(metricData.get(1).getMetricId().getFullName(), "big_data.size");
    assertEquals(metricData.get(1).getValue(), metricValue, EPSILON);
    assertEquals(metricData.get(2).getMetricId().getFullName(), "big_data.count");
    assertEquals(metricData.get(2).getValue(), metricValue, EPSILON);
}

From source file:com.twitter.elephanttwin.lucene.indexing.AbstractLuceneIndexingJob.java

License:Apache License

private void writeIndexDescriptors(ETwinIndexDescriptor ETwinIndexDescriptor) throws IOException {
    Configuration conf = getConf();

    FileSystem fs = (new Path(IndexConfig.index.get()).getFileSystem(conf));

    FileStatus[] fileStats = fs.globStatus(new Path(IndexConfig.index.get(), "*"));

    // We write one indexDescriptor per generated index segment.
    // Something to consider: right now it's a straight-up serialized Thrift object.
    // Would it be better to do the LzoBase64Line thing, so that we can apply our tools?
    // or extend the tools?
    for (int i = 0; i < fileStats.length; i++) {
        ETwinIndexDescriptor.setIndexPart(i);
        FileStatus stat = fileStats[i];
        Path idxPath = new Path(stat.getPath().getParent(), "_" + stat.getPath().getName() + ".indexmeta");
        FSDataOutputStream os = fs.create(idxPath, true);
        @SuppressWarnings("unchecked")
        ThriftWritable<ETwinIndexDescriptor> writable = (ThriftWritable<ETwinIndexDescriptor>) ThriftWritable
                .newInstance(ETwinIndexDescriptor.getClass());
        writable.set(ETwinIndexDescriptor);
        writable.write(os);
        os.close();
    }

}