Example usage for org.apache.hadoop.fs FileSystem globStatus

List of usage examples for org.apache.hadoop.fs FileSystem globStatus

Introduction

This page shows example usages of org.apache.hadoop.fs.FileSystem.globStatus.

Prototype

public FileStatus[] globStatus(Path pathPattern) throws IOException 

Document

Returns all files that match filePattern and are not checksum files. Results are sorted by their names. If the pattern contains no glob characters and the path does not exist, null is returned instead of an empty array.
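
A minimal sketch of the common call pattern seen in the examples below (the path pattern here is hypothetical): guard against a null result before iterating, since globStatus may return null when nothing on the path exists.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public static void printMatches(Configuration conf) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    // hypothetical pattern; globStatus may return null rather than an
    // empty array when the path (or its parent) does not exist
    FileStatus[] matches = fs.globStatus(new Path("/data/logs/2020-*/part-*"));
    if (matches != null) {
        for (FileStatus status : matches) {
            System.out.println(status.getPath());
        }
    }
}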

Usage

From source file:com.streamsets.pipeline.stage.destination.hdfs.writer.RecordWriterManager.java

License:Apache License

public void commitOldFiles(FileSystem fs) throws IOException, ELEvalException {
    // if getLastBatchTime() is zero it means we never ran, so there is nothing to commit
    if (context.getLastBatchTime() > 0) {
        for (String glob : getGlobs()) {
            LOG.debug("Looking for uncommitted files using glob '{}'", glob);
            FileStatus[] globStatus = fs.globStatus(new Path(glob));
            if (globStatus != null) {
                for (FileStatus status : globStatus) {
                    LOG.debug("Found uncommitted file '{}'", status.getPath());
                    renameToFinalName(fs, status.getPath());
                }
            }
        }
    }
}

From source file:com.streamsets.pipeline.stage.destination.hdfs.writer.TestRecordWriterManager.java

License:Apache License

@Test
public void testGetGlobsAndCommitOldFiles() throws Exception {
    Calendar calendar = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
    calendar.add(Calendar.HOUR, -2);
    Date lastBatch = calendar.getTime();
    ContextInfoCreator.setLastBatch(targetContext, lastBatch.getTime());

    calendar.add(Calendar.HOUR, -1);
    Date beforeLastBatchWithinCutOff = calendar.getTime();

    calendar.add(Calendar.DATE, -1);
    Date beforeLastBatchOutsideCutOff = calendar.getTime();

    calendar.add(Calendar.DATE, 2);
    Date future = calendar.getTime();

    File testDir = new File("target", UUID.randomUUID().toString()).getAbsoluteFile();
    Assert.assertTrue(testDir.mkdirs());

    // using 1 hour cutoff
    RecordWriterManager mgr = getRecordWriterManager(
            testDir.getAbsolutePath() + "/${YY()}_${MM()}_${DD()}_${hh()}/${record:value('/')}", 3600);

    // this one should not show up when globbing
    String f1 = createTempFile(mgr, beforeLastBatchOutsideCutOff, "a");

    // all of these should show up when globbing
    String f2 = createTempFile(mgr, beforeLastBatchWithinCutOff, "b");
    String f3 = createTempFile(mgr, beforeLastBatchWithinCutOff, "c");
    String f4 = createTempFile(mgr, lastBatch, "d");

    // this one should not show up when globbing
    String f5 = createTempFile(mgr, future, "e");

    Set<String> expected = ImmutableSet.of(f2, f3, f4);

    Set<String> got = new HashSet<>();
    URI uri = new URI("file:///");
    Configuration conf = new HdfsConfiguration();
    FileSystem fs = FileSystem.get(uri, conf);

    // verifying the globs returned by getGlobs() are within the search boundaries
    List<String> globs = mgr.getGlobs();
    for (String glob : globs) {
        FileStatus[] status = fs.globStatus(new Path("file://" + glob));
        for (FileStatus s : status) {
            got.add(s.getPath().toString().substring("file:".length()));
        }
    }
    Assert.assertEquals(expected, got);

    // committing all temps within search boundaries
    mgr.commitOldFiles(fs);

    // verifying there are no temp files within the search boundaries after committing
    for (String glob : globs) {
        FileStatus[] status = fs.globStatus(new Path("file://" + glob));
        if (status != null) {
            for (FileStatus s : status) {
                Assert.fail("Uncommitted temp file left behind: " + s.getPath());
            }
        }
    }

    // verifying temps outside boundaries are still there
    Assert.assertTrue(new File(f1).exists());
    Assert.assertTrue(new File(f5).exists());

}

From source file:com.taobao.datax.plugins.common.DFSUtils.java

License:Open Source License

/**
 * List the statuses of the files/directories in the given path if the path
 * is a directory.
 * 
 * @param dfs
 *            handle of {@link FileSystem}
 * 
 * @param srcpath
 *            Path in {@link FileSystem}
 * 
 * @param isGlob
 *            whether to treat srcpath as a file pattern
 * 
 * @return all {@link Path} in srcpath
 * 
 * @throws IOException 
 * 
 * */
public static List<Path> listDir(FileSystem dfs, Path srcpath, boolean isGlob) throws IOException {
    List<Path> list = new ArrayList<Path>();
    FileStatus[] status = null;
    if (isGlob) {
        status = dfs.globStatus(srcpath);
    } else {
        status = dfs.listStatus(srcpath);
    }
    if (status != null) {
        for (FileStatus state : status) {
            list.add(state.getPath());
        }
    }

    return list;
}
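
A hedged usage sketch of listDir (the paths and the dfs handle are hypothetical): with isGlob true, srcpath is expanded as a pattern via globStatus; with isGlob false, the directory's children are listed via listStatus.

// hypothetical paths, assuming an existing FileSystem handle named dfs
List<Path> parts = DFSUtils.listDir(dfs, new Path("/data/input/part-*"), true); // glob expansion
List<Path> children = DFSUtils.listDir(dfs, new Path("/data/input"), false); // direct listing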

From source file:com.tfm.utad.reducerdata.ReducerDataPig.java

public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd-HH-mm-ss"); // "yyyy" (calendar year), not "YYYY" (week year)
    Date date = new Date();

    Path inputPath = new Path("/home/jab/camus/reducer-data-pig");
    Path outputDir = new Path("/home/jab/camus/pigdata/" + sdf.format(date));

    // Create configuration
    Configuration conf = new Configuration(true);
    conf.set(FS_DEFAULT_FS, HDFS_LOCALHOST_LOCALDOMAIN);
    FileSystem fs = FileSystem.get(conf);
    Path filesPath = new Path(inputPath + "/*");
    FileStatus[] files = fs.globStatus(filesPath);

    // Create job
    Job job = new Job(conf, "ReducerDataPig");
    job.setJarByClass(ReducerDataPig.class);

    // Setup MapReduce
    job.setMapperClass(ReducerDataPigMapper.class);
    job.setReducerClass(ReducerDataPigReducer.class);
    job.setNumReduceTasks(1);

    // Specify key / value
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(ReducerPigKey.class);

    // Input
    FileInputFormat.addInputPath(job, inputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);

    // Output
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Delete output if exists
    if (fs.exists(outputDir)) {
        fs.delete(outputDir, true);
    }

    // Execute job
    int code = job.waitForCompletion(true) ? 0 : 1;
    if (code == 0) {
        Counters counters = job.getCounters();
        Counter malformedCounter = counters.findCounter(ReducerDataEnum.MALFORMED_DATA);
        LOG.info("Counter malformed data: " + malformedCounter.getValue());
        for (FileStatus fStatus : files) {
            LOG.info("File name:" + fStatus.getPath());
            if (fStatus.isFile()) {
                LOG.info("Removing file in path:" + fStatus.getPath());
                fs.delete(fStatus.getPath(), false);
            }
        }
    }
}

From source file:com.tfm.utad.reducerdata.ReducerDataVertica.java

public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd-HH-mm-ss"); // "yyyy" (calendar year), not "YYYY" (week year)
    Date date = new Date();

    Path inputPath = new Path("/home/jab/camus/reducer-data-vertica");
    Path outputDir = new Path("/home/jab/camus/verticadb/" + sdf.format(date));

    // Create configuration
    Configuration conf = new Configuration(true);
    conf.set(FS_DEFAULT_FS, HDFS_LOCALHOST_LOCALDOMAIN);
    FileSystem fs = FileSystem.get(conf);
    Path filesPath = new Path(inputPath + "/*");
    FileStatus[] files = fs.globStatus(filesPath);

    // Create job
    Job job = new Job(conf, "ReducerDataVertica");
    job.setJarByClass(ReducerDataVertica.class);

    // Setup MapReduce
    job.setMapperClass(ReducerDataVerticaMapper.class);
    job.setReducerClass(ReducerDataVerticaReducer.class);
    job.setNumReduceTasks(1);

    // Specify key / value
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(ReducerVerticaValue.class);

    // Input
    FileInputFormat.addInputPath(job, inputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);

    // Output
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Delete output if exists
    if (fs.exists(outputDir)) {
        fs.delete(outputDir, true);
    }

    // Execute job
    int code = job.waitForCompletion(true) ? 0 : 1;
    if (code == 0) {
        Counters counters = job.getCounters();
        Counter malformedCounter = counters.findCounter(ReducerDataEnum.MALFORMED_DATA);
        LOG.info("Counter malformed data: " + malformedCounter.getValue());
        for (FileStatus fStatus : files) {
            LOG.info("File name:" + fStatus.getPath());
            if (fStatus.isFile()) {
                LOG.info("Removing file in path:" + fStatus.getPath());
                fs.delete(fStatus.getPath(), false);
            }
        }
    }
}

From source file:com.tfm.utad.sqoopdata.SqoopVerticaDB.java

public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(true);
    conf.set(FS_DEFAULT_FS, HDFS_LOCALHOST_LOCALDOMAIN);
    FileSystem fs = FileSystem.get(conf);
    Path filesPath = new Path(config.get(INPUT_DIRECTORY) + "/*/part-r*");
    FileStatus[] files = fs.globStatus(filesPath);
    for (FileStatus fStatus : files) {
        LOG.info("Path name:" + fStatus.getPath());
        Long minID = getID();
        int output = sqoopHDFStoVerticaDB(fStatus.getPath(), conf);
        if (output == 0) {
            LOG.info("Removing directory in path:" + fStatus.getPath().getParent());
            fs.delete(fStatus.getPath().getParent(), true);
            Long maxID = getID();
            sendDataToCartoDB(minID, maxID);
        } else {
            LOG.error("Sqoop FAILED exec file:" + fStatus.getPath()
                    + ".Please, contact the system administrator.");
        }
    }
}

From source file:com.turn.camino.Camino.java

License:Open Source License

/**
 * Materialize path
 *
 * Converts a path or path pattern into zero or more actual paths
 *
 * @param value rendered value of path
 * @param fileSystem file system
 * @return path status
 * @throws IOException
 */
protected List<PathDetail> materializePath(String value, FileSystem fileSystem) throws IOException {

    // using value to find path
    FileStatus[] fss = fileSystem.globStatus(new org.apache.hadoop.fs.Path(value));

    // path doesn't exist
    if (fss == null || fss.length == 0) {
        return Collections.emptyList();
    }

    // found match(es)
    List<PathDetail> pathDetails = Lists.newArrayListWithExpectedSize(fss.length);
    for (FileStatus fs : fss) {
        PathDetail pathDetail = new PathDetail(fs.getPath().toString(), fs.isDirectory(), fs.getLen(),
                fs.getModificationTime());
        pathDetails.add(pathDetail);
    }

    // return path details
    return pathDetails;
}

From source file:com.turn.camino.CaminoTest.java

License:Open Source License

/**
 * Test materializing a path
 *
 * @throws IOException
 */
@Test
public void testMaterializePath() throws IOException {

    long blockSize = 256L * 1024 * 1024;
    long now = System.currentTimeMillis();
    FileSystem fileSystem = mock(FileSystem.class);

    // path that results in one single file
    String pathValue1 = "/foo/bar";
    org.apache.hadoop.fs.Path path1 = new org.apache.hadoop.fs.Path(pathValue1);
    FileStatus[] fss1 = new FileStatus[] { new FileStatus(15000, false, 3, blockSize, now - 10000, path1) };
    when(fileSystem.globStatus(path1)).thenReturn(fss1);
    List<PathDetail> pathDetails1 = camino.materializePath(pathValue1, fileSystem);
    assertNotNull(pathDetails1);
    assertEquals(pathDetails1.size(), 1);
    assertEquals(pathDetails1.get(0).getLastModifiedTime(), fss1[0].getModificationTime());
    assertEquals(pathDetails1.get(0).getLength(), fss1[0].getLen());
    assertEquals(pathDetails1.get(0).isDirectory(), fss1[0].isDirectory());
    assertEquals(pathDetails1.get(0).getPathValue(), pathValue1);

    // path that results in no file
    String pathValue2 = "/foo/baz";
    org.apache.hadoop.fs.Path path2 = new org.apache.hadoop.fs.Path(pathValue2);
    when(fileSystem.globStatus(path2)).thenReturn(new FileStatus[] {});
    List<PathDetail> pathDetails2 = camino.materializePath(pathValue2, fileSystem);
    assertNotNull(pathDetails2);
    assertEquals(pathDetails2.size(), 0);

    // path whose parent doesn't exist (so globStatus returns null)
    String pathValue3 = "/goo/bao";
    org.apache.hadoop.fs.Path path3 = new org.apache.hadoop.fs.Path(pathValue3);
    when(fileSystem.globStatus(path3)).thenReturn(null);
    List<PathDetail> pathDetails3 = camino.materializePath(pathValue3, fileSystem);
    assertNotNull(pathDetails3);
    assertEquals(pathDetails3.size(), 0);

    // path that returns multiple files
    String pathValue4 = "/foo/bub_*";
    org.apache.hadoop.fs.Path path4 = new org.apache.hadoop.fs.Path(pathValue4);
    FileStatus[] fss4 = new FileStatus[] {
            new FileStatus(15000, false, 3, blockSize, now - 10000,
                    new org.apache.hadoop.fs.Path("/foo/bub_1")),
            new FileStatus(24000, false, 3, blockSize, now - 15000,
                    new org.apache.hadoop.fs.Path("/foo/bub_2")) };
    when(fileSystem.globStatus(path4)).thenReturn(fss4);
    List<PathDetail> pathDetails4 = camino.materializePath(pathValue4, fileSystem);
    assertNotNull(pathDetails4);
    assertEquals(pathDetails4.size(), 2);
    assertEquals(pathDetails4.get(0).getLastModifiedTime(), fss4[0].getModificationTime());
    assertEquals(pathDetails4.get(0).getLength(), fss4[0].getLen());
    assertEquals(pathDetails4.get(0).isDirectory(), fss4[0].isDirectory());
    assertEquals(pathDetails4.get(0).getPathValue(), "/foo/bub_1");
    assertEquals(pathDetails4.get(1).getLastModifiedTime(), fss4[1].getModificationTime());
    assertEquals(pathDetails4.get(1).getLength(), fss4[1].getLen());
    assertEquals(pathDetails4.get(1).isDirectory(), fss4[1].isDirectory());
    assertEquals(pathDetails4.get(1).getPathValue(), "/foo/bub_2");
}

From source file:com.turn.camino.CaminoTest.java

License:Open Source License

/**
 * Test processPathMetrics
 *
 * @throws InvalidNameException
 * @throws WrongTypeException
 * @throws RenderException
 * @throws IOException
 */
@Test
public void testProcessPathMetrics() throws InvalidNameException, WrongTypeException, RenderException,
        IOException, ExecutionException, InterruptedException {

    // create test environment
    long now = System.currentTimeMillis();
    double metricValue = 123456;
    Env env = mock(Env.class);
    Context context = mockGlobalContext(env);
    mockChildContext(context, context, env);

    // mock renderer
    Renderer renderer = mock(Renderer.class);
    mockMetricFunction(renderer, "age", metricValue);
    mockMetricFunction(renderer, "size", metricValue);
    mockMetricFunction(renderer, "count", metricValue);
    when(renderer.render(eq("big_data"), any(Context.class))).thenReturn("big_data");
    when(renderer.render(eq("/app/big_data"), any(Context.class))).thenReturn("/app/big_data");
    List<Path> paths = of(new Path("big_data", "/app/big_data"));

    // mock file system
    FileSystem fileSystem = mockFileSystem(env);
    org.apache.hadoop.fs.Path hadoopPath = new org.apache.hadoop.fs.Path("/app/big_data");
    when(fileSystem.globStatus(hadoopPath)).thenReturn(
            new FileStatus[] { new FileStatus(15000, false, 3, 256 * 1024 * 1024, now - 10000, hadoopPath) });

    // process path metrics
    List<Future<PathMetrics>> futures = Lists.newLinkedList();
    camino.processPathMetrics(paths, renderer, context, executorService, futures);

    // check that path status was resolved correctly
    PathMetrics pathMetrics = futures.get(0).get();
    PathStatus pathStatus = pathMetrics.getPathStatus();
    assertEquals(pathStatus.getName(), "big_data");
    assertEquals(pathStatus.getValue(), "/app/big_data");
    assertEquals(pathStatus.getPathDetails().size(), 1);
    assertEquals(pathStatus.getPathDetails().get(0).getPathValue(), "/app/big_data");
    assertFalse(pathStatus.getPathDetails().get(0).isDirectory());
    assertEquals(pathStatus.getPathDetails().get(0).getLength(), 15000);
    assertEquals(pathStatus.getPathDetails().get(0).getLastModifiedTime(), now - 10000);

    // check that metric data is expected
    List<MetricDatum> metricData = pathMetrics.getMetricData();
    assertEquals(metricData.size(), 3);
    assertEquals(metricData.get(0).getMetricId().getFullName(), "big_data.age");
    assertEquals(metricData.get(0).getValue(), metricValue, EPSILON);
    assertEquals(metricData.get(1).getMetricId().getFullName(), "big_data.size");
    assertEquals(metricData.get(1).getValue(), metricValue, EPSILON);
    assertEquals(metricData.get(2).getMetricId().getFullName(), "big_data.count");
    assertEquals(metricData.get(2).getValue(), metricValue, EPSILON);
}

From source file:com.twitter.elephanttwin.lucene.indexing.AbstractLuceneIndexingJob.java

License:Apache License

private void writeIndexDescriptors(ETwinIndexDescriptor ETwinIndexDescriptor) throws IOException {
    Configuration conf = getConf();

    FileSystem fs = (new Path(IndexConfig.index.get()).getFileSystem(conf));

    FileStatus[] fileStats = fs.globStatus(new Path(IndexConfig.index.get(), "*"));

    // We write one indexDescriptor per generated index segment.
    // Something to consider: right now it's a straight-up serialized Thrift object.
    // Would it be better to do the LzoBase64Line thing, so that we can apply our tools?
    // or extend the tools?
    for (int i = 0; i < fileStats.length; i++) {
        ETwinIndexDescriptor.setIndexPart(i);
        FileStatus stat = fileStats[i];
        Path idxPath = new Path(stat.getPath().getParent(), "_" + stat.getPath().getName() + ".indexmeta");
        FSDataOutputStream os = fs.create(idxPath, true);
        @SuppressWarnings("unchecked")
        ThriftWritable<ETwinIndexDescriptor> writable = (ThriftWritable<ETwinIndexDescriptor>) ThriftWritable
                .newInstance(ETwinIndexDescriptor.getClass());
        writable.set(ETwinIndexDescriptor);
        writable.write(os);
        os.close();
    }

}