List of usage examples for org.apache.hadoop.fs FileSystem globStatus
public FileStatus[] globStatus(Path pathPattern) throws IOException
Return all the files that match pathPattern and are not checksum files. Results are sorted by their names.
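Before the sourced examples below, a minimal self-contained sketch of the call itself (the pattern /data/logs/2023-*/part-* and the default Configuration are hypothetical, chosen only for illustration):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusExample {
    public static void main(String[] args) throws IOException {
        FileSystem fs = FileSystem.get(new Configuration());
        // Hypothetical pattern: every part file under any 2023-dated directory.
        FileStatus[] matches = fs.globStatus(new Path("/data/logs/2023-*/part-*"));
        // globStatus can return null (e.g. for a pattern without a glob whose
        // path does not exist), so guard before iterating, as the sourced
        // examples below also do.
        if (matches != null) {
            for (FileStatus status : matches) {
                System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
            }
        }
    }
}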
From source file:com.streamsets.pipeline.stage.destination.hdfs.writer.RecordWriterManager.java
License:Apache License
public void commitOldFiles(FileSystem fs) throws IOException, ELEvalException {
    // if getLastBatchTime() is zero it means we never ran, so there is nothing to commit
    if (context.getLastBatchTime() > 0) {
        for (String glob : getGlobs()) {
            LOG.debug("Looking for uncommitted files using glob '{}'", glob);
            FileStatus[] globStatus = fs.globStatus(new Path(glob));
            if (globStatus != null) {
                for (FileStatus status : globStatus) {
                    LOG.debug("Found uncommitted file '{}'", status.getPath());
                    renameToFinalName(fs, status.getPath());
                }
            }
        }
    }
}
From source file:com.streamsets.pipeline.stage.destination.hdfs.writer.TestRecordWriterManager.java
License:Apache License
@Test
public void testGetGlobsAndCommitOldFiles() throws Exception {
    Calendar calendar = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
    calendar.add(Calendar.HOUR, -2);
    Date lastBatch = calendar.getTime();
    ContextInfoCreator.setLastBatch(targetContext, lastBatch.getTime());
    calendar.add(Calendar.HOUR, -1);
    Date beforeLastBatchWithinCutOff = calendar.getTime();
    calendar.add(Calendar.DATE, -1);
    Date beforeLastBatchOutsideCutOff = calendar.getTime();
    calendar.add(Calendar.DATE, 2);
    Date future = calendar.getTime();

    File testDir = new File("target", UUID.randomUUID().toString()).getAbsoluteFile();
    Assert.assertTrue(testDir.mkdirs());

    // using 1 hour cutoff
    RecordWriterManager mgr = getRecordWriterManager(
        testDir.getAbsolutePath() + "/${YY()}_${MM()}_${DD()}_${hh()}/${record:value('/')}", 3600);

    // this one should not show up when globbing
    String f1 = createTempFile(mgr, beforeLastBatchOutsideCutOff, "a");
    // all of these should show up when globbing
    String f2 = createTempFile(mgr, beforeLastBatchWithinCutOff, "b");
    String f3 = createTempFile(mgr, beforeLastBatchWithinCutOff, "c");
    String f4 = createTempFile(mgr, lastBatch, "d");
    // this one should not show up when globbing
    String f5 = createTempFile(mgr, future, "e");

    Set<String> expected = ImmutableSet.of(f2, f3, f4);
    Set<String> got = new HashSet<>();
    URI uri = new URI("file:///");
    Configuration conf = new HdfsConfiguration();
    FileSystem fs = FileSystem.get(uri, conf);

    // verifying the globs returned by getGlobs() are within the search boundaries
    List<String> globs = mgr.getGlobs();
    for (String glob : globs) {
        FileStatus[] status = fs.globStatus(new Path("file://" + glob));
        for (FileStatus s : status) {
            got.add(s.getPath().toString().substring("file:".length()));
        }
    }
    Assert.assertEquals(expected, got);

    // committing all temps within search boundaries
    mgr.commitOldFiles(fs);

    // verifying there are no temps within search boundaries after committing
    for (String glob : globs) {
        FileStatus[] status = fs.globStatus(new Path("file://" + glob));
        for (FileStatus s : status) {
            Assert.fail();
        }
    }

    // verifying temps outside boundaries are still there
    Assert.assertTrue(new File(f1).exists());
    Assert.assertTrue(new File(f5).exists());
}
From source file:com.taobao.datax.plugins.common.DFSUtils.java
License:Open Source License
/**
 * List the statuses of the files/directories in the given path if the path
 * is a directory.
 *
 * @param dfs     handle of {@link FileSystem}
 * @param srcpath path in {@link FileSystem}
 * @param isGlob  whether srcpath should be treated as a file pattern
 * @return all {@link Path}s in srcpath
 * @throws IOException
 */
public static List<Path> listDir(FileSystem dfs, Path srcpath, boolean isGlob) throws IOException {
    List<Path> list = new ArrayList<Path>();
    FileStatus[] status = null;
    if (isGlob) {
        status = dfs.globStatus(srcpath);
    } else {
        status = dfs.listStatus(srcpath);
    }
    if (status != null) {
        for (FileStatus state : status) {
            list.add(state.getPath());
        }
    }
    return list;
}
From source file:com.tfm.utad.reducerdata.ReducerDataPig.java
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    // note: the original pattern used "YYYY" (week year); "yyyy" (calendar year) is what is intended here
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd-HH-mm-ss");
    Date date = new Date();
    Path inputPath = new Path("/home/jab/camus/reducer-data-pig");
    Path outputDir = new Path("/home/jab/camus/pigdata/" + sdf.format(date));

    // Create configuration
    Configuration conf = new Configuration(true);
    conf.set(FS_DEFAULT_FS, HDFS_LOCALHOST_LOCALDOMAIN);
    FileSystem fs = FileSystem.get(conf);
    Path filesPath = new Path(inputPath + "/*");
    FileStatus[] files = fs.globStatus(filesPath);

    // Create job
    Job job = new Job(conf, "ReducerDataPig");
    job.setJarByClass(ReducerDataPig.class);

    // Setup MapReduce
    job.setMapperClass(ReducerDataPigMapper.class);
    job.setReducerClass(ReducerDataPigReducer.class);
    job.setNumReduceTasks(1);

    // Specify key / value
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(ReducerPigKey.class);

    // Input
    FileInputFormat.addInputPath(job, inputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);

    // Output
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Delete output if it exists
    if (fs.exists(outputDir)) {
        fs.delete(outputDir, true);
    }

    // Execute job
    int code = job.waitForCompletion(true) ? 0 : 1;
    if (code == 0) {
        Counters counters = job.getCounters();
        Counter malformedCounter = counters.findCounter(ReducerDataEnum.MALFORMED_DATA);
        LOG.info("Counter malformed data: " + malformedCounter.getValue());
        for (FileStatus fStatus : files) {
            LOG.info("File name:" + fStatus.getPath());
            if (fStatus.isFile()) {
                LOG.info("Removing file in path:" + fStatus.getPath());
                fs.delete(fStatus.getPath(), false);
            }
        }
    }
}
From source file:com.tfm.utad.reducerdata.ReducerDataVertica.java
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    // note: the original pattern used "YYYY" (week year); "yyyy" (calendar year) is what is intended here
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd-HH-mm-ss");
    Date date = new Date();
    Path inputPath = new Path("/home/jab/camus/reducer-data-vertica");
    Path outputDir = new Path("/home/jab/camus/verticadb/" + sdf.format(date));

    // Create configuration
    Configuration conf = new Configuration(true);
    conf.set(FS_DEFAULT_FS, HDFS_LOCALHOST_LOCALDOMAIN);
    FileSystem fs = FileSystem.get(conf);
    Path filesPath = new Path(inputPath + "/*");
    FileStatus[] files = fs.globStatus(filesPath);

    // Create job
    Job job = new Job(conf, "ReducerDataVertica");
    job.setJarByClass(ReducerDataVertica.class);

    // Setup MapReduce
    job.setMapperClass(ReducerDataVerticaMapper.class);
    job.setReducerClass(ReducerDataVerticaReducer.class);
    job.setNumReduceTasks(1);

    // Specify key / value
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(ReducerVerticaValue.class);

    // Input
    FileInputFormat.addInputPath(job, inputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);

    // Output
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Delete output if it exists
    if (fs.exists(outputDir)) {
        fs.delete(outputDir, true);
    }

    // Execute job
    int code = job.waitForCompletion(true) ? 0 : 1;
    if (code == 0) {
        Counters counters = job.getCounters();
        Counter malformedCounter = counters.findCounter(ReducerDataEnum.MALFORMED_DATA);
        LOG.info("Counter malformed data: " + malformedCounter.getValue());
        for (FileStatus fStatus : files) {
            LOG.info("File name:" + fStatus.getPath());
            if (fStatus.isFile()) {
                LOG.info("Removing file in path:" + fStatus.getPath());
                fs.delete(fStatus.getPath(), false);
            }
        }
    }
}
From source file:com.tfm.utad.sqoopdata.SqoopVerticaDB.java
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(true);
    conf.set(FS_DEFAULT_FS, HDFS_LOCALHOST_LOCALDOMAIN);
    FileSystem fs = FileSystem.get(conf);
    Path filesPath = new Path(config.get(INPUT_DIRECTORY) + "/*/part-r*");
    FileStatus[] files = fs.globStatus(filesPath);
    for (FileStatus fStatus : files) {
        LOG.info("Path name:" + fStatus.getPath());
        Long minID = getID();
        int output = sqoopHDFStoVerticaDB(fStatus.getPath(), conf);
        if (output == 0) {
            LOG.info("Removing directory in path:" + fStatus.getPath().getParent());
            fs.delete(fStatus.getPath().getParent(), true);
            Long maxID = getID();
            sendDataToCartoDB(minID, maxID);
        } else {
            LOG.error("Sqoop FAILED exec file:" + fStatus.getPath()
                + ". Please contact the system administrator.");
        }
    }
}
From source file:com.turn.camino.Camino.java
License:Open Source License
/**
 * Materialize path
 *
 * Converts a path or path pattern into zero or more actual paths
 *
 * @param value rendered value of path
 * @param fileSystem file system
 * @return path status
 * @throws IOException
 */
protected List<PathDetail> materializePath(String value, FileSystem fileSystem) throws IOException {
    // using value to find path
    FileStatus[] fss = fileSystem.globStatus(new org.apache.hadoop.fs.Path(value));
    // path doesn't exist
    if (fss == null || fss.length == 0) {
        return Collections.emptyList();
    }
    // found match(es)
    List<PathDetail> pathDetails = Lists.newArrayListWithExpectedSize(fss.length);
    for (FileStatus fs : fss) {
        PathDetail pathDetail = new PathDetail(fs.getPath().toString(), fs.isDirectory(), fs.getLen(),
            fs.getModificationTime());
        pathDetails.add(pathDetail);
    }
    // return path details
    return pathDetails;
}
From source file:com.turn.camino.CaminoTest.java
License:Open Source License
/**
 * Test materializing a path
 *
 * @throws IOException
 */
@Test
public void testMaterializePath() throws IOException {
    long blockSize = 256L * 1024 * 1024;
    long now = System.currentTimeMillis();
    FileSystem fileSystem = mock(FileSystem.class);

    // path that results in one single file
    String pathValue1 = "/foo/bar";
    org.apache.hadoop.fs.Path path1 = new org.apache.hadoop.fs.Path(pathValue1);
    FileStatus[] fss1 = new FileStatus[] { new FileStatus(15000, false, 3, blockSize, now - 10000, path1) };
    when(fileSystem.globStatus(path1)).thenReturn(fss1);
    List<PathDetail> pathDetails1 = camino.materializePath(pathValue1, fileSystem);
    assertNotNull(pathDetails1);
    assertEquals(pathDetails1.size(), 1);
    assertEquals(pathDetails1.get(0).getLastModifiedTime(), fss1[0].getModificationTime());
    assertEquals(pathDetails1.get(0).getLength(), fss1[0].getLen());
    assertEquals(pathDetails1.get(0).isDirectory(), fss1[0].isDirectory());
    assertEquals(pathDetails1.get(0).getPathValue(), pathValue1);

    // path that results in no file
    String pathValue2 = "/foo/baz";
    org.apache.hadoop.fs.Path path2 = new org.apache.hadoop.fs.Path(pathValue2);
    when(fileSystem.globStatus(path2)).thenReturn(new FileStatus[] {});
    List<PathDetail> pathDetails2 = camino.materializePath(pathValue2, fileSystem);
    assertNotNull(pathDetails2);
    assertEquals(pathDetails2.size(), 0);

    // path whose parent doesn't exist (so globStatus returns null)
    String pathValue3 = "/goo/bao";
    org.apache.hadoop.fs.Path path3 = new org.apache.hadoop.fs.Path(pathValue3);
    when(fileSystem.globStatus(path3)).thenReturn(null);
    List<PathDetail> pathDetails3 = camino.materializePath(pathValue3, fileSystem);
    assertNotNull(pathDetails3);
    assertEquals(pathDetails3.size(), 0);

    // path that returns multiple files
    String pathValue4 = "/foo/bub_*";
    org.apache.hadoop.fs.Path path4 = new org.apache.hadoop.fs.Path(pathValue4);
    FileStatus[] fss4 = new FileStatus[] {
        new FileStatus(15000, false, 3, blockSize, now - 10000, new org.apache.hadoop.fs.Path("/foo/bub_1")),
        new FileStatus(24000, false, 3, blockSize, now - 15000, new org.apache.hadoop.fs.Path("/foo/bub_2")) };
    when(fileSystem.globStatus(path4)).thenReturn(fss4);
    List<PathDetail> pathDetails4 = camino.materializePath(pathValue4, fileSystem);
    assertNotNull(pathDetails4);
    assertEquals(pathDetails4.size(), 2);
    assertEquals(pathDetails4.get(0).getLastModifiedTime(), fss4[0].getModificationTime());
    assertEquals(pathDetails4.get(0).getLength(), fss4[0].getLen());
    assertEquals(pathDetails4.get(0).isDirectory(), fss4[0].isDirectory());
    assertEquals(pathDetails4.get(0).getPathValue(), "/foo/bub_1");
    assertEquals(pathDetails4.get(1).getLastModifiedTime(), fss4[1].getModificationTime());
    assertEquals(pathDetails4.get(1).getLength(), fss4[1].getLen());
    assertEquals(pathDetails4.get(1).isDirectory(), fss4[1].isDirectory());
    assertEquals(pathDetails4.get(1).getPathValue(), "/foo/bub_2");
}
From source file:com.turn.camino.CaminoTest.java
License:Open Source License
/**
 * Test processPathMetrics
 *
 * @throws InvalidNameException
 * @throws WrongTypeException
 * @throws RenderException
 * @throws IOException
 */
@Test
public void testProcessPathMetrics() throws InvalidNameException, WrongTypeException, RenderException,
        IOException, ExecutionException, InterruptedException {

    // create test environment
    long now = System.currentTimeMillis();
    double metricValue = 123456;
    Env env = mock(Env.class);
    Context context = mockGlobalContext(env);
    mockChildContext(context, context, env);

    // mock renderer
    Renderer renderer = mock(Renderer.class);
    mockMetricFunction(renderer, "age", metricValue);
    mockMetricFunction(renderer, "size", metricValue);
    mockMetricFunction(renderer, "count", metricValue);
    when(renderer.render(eq("big_data"), any(Context.class))).thenReturn("big_data");
    when(renderer.render(eq("/app/big_data"), any(Context.class))).thenReturn("/app/big_data");
    List<Path> paths = of(new Path("big_data", "/app/big_data"));

    // mock file system
    FileSystem fileSystem = mockFileSystem(env);
    org.apache.hadoop.fs.Path hadoopPath = new org.apache.hadoop.fs.Path("/app/big_data");
    when(fileSystem.globStatus(hadoopPath)).thenReturn(
        new FileStatus[] { new FileStatus(15000, false, 3, 256 * 1024 * 1024, now - 10000, hadoopPath) });

    // process path metrics
    List<Future<PathMetrics>> futures = Lists.newLinkedList();
    camino.processPathMetrics(paths, renderer, context, executorService, futures);

    // check that path status was resolved correctly
    PathMetrics pathMetrics = futures.get(0).get();
    PathStatus pathStatus = pathMetrics.getPathStatus();
    assertEquals(pathStatus.getName(), "big_data");
    assertEquals(pathStatus.getValue(), "/app/big_data");
    assertEquals(pathStatus.getPathDetails().size(), 1);
    assertEquals(pathStatus.getPathDetails().get(0).getPathValue(), "/app/big_data");
    assertFalse(pathStatus.getPathDetails().get(0).isDirectory());
    assertEquals(pathStatus.getPathDetails().get(0).getLength(), 15000);
    assertEquals(pathStatus.getPathDetails().get(0).getLastModifiedTime(), now - 10000);

    // check that metric data is as expected
    List<MetricDatum> metricData = pathMetrics.getMetricData();
    assertEquals(metricData.size(), 3);
    assertEquals(metricData.get(0).getMetricId().getFullName(), "big_data.age");
    assertEquals(metricData.get(0).getValue(), metricValue, EPSILON);
    assertEquals(metricData.get(1).getMetricId().getFullName(), "big_data.size");
    assertEquals(metricData.get(1).getValue(), metricValue, EPSILON);
    assertEquals(metricData.get(2).getMetricId().getFullName(), "big_data.count");
    assertEquals(metricData.get(2).getValue(), metricValue, EPSILON);
}
From source file:com.twitter.elephanttwin.lucene.indexing.AbstractLuceneIndexingJob.java
License:Apache License
private void writeIndexDescriptors(ETwinIndexDescriptor ETwinIndexDescriptor) throws IOException {
    Configuration conf = getConf();
    FileSystem fs = (new Path(IndexConfig.index.get()).getFileSystem(conf));
    FileStatus[] fileStats = fs.globStatus(new Path(IndexConfig.index.get(), "*"));
    // We write one indexDescriptor per generated index segment.
    // Something to consider: right now it's a straight-up serialized Thrift object.
    // Would it be better to do the LzoBase64Line thing, so that we can apply our tools?
    // Or extend the tools?
    for (int i = 0; i < fileStats.length; i++) {
        ETwinIndexDescriptor.setIndexPart(i);
        FileStatus stat = fileStats[i];
        Path idxPath = new Path(stat.getPath().getParent(), "_" + stat.getPath().getName() + ".indexmeta");
        FSDataOutputStream os = fs.create(idxPath, true);
        @SuppressWarnings("unchecked")
        ThriftWritable<ETwinIndexDescriptor> writable =
            (ThriftWritable<ETwinIndexDescriptor>) ThriftWritable.newInstance(ETwinIndexDescriptor.getClass());
        writable.set(ETwinIndexDescriptor);
        writable.write(os);
        os.close();
    }
}