Usage examples for org.apache.hadoop.fs.FileSystem.get(Configuration)
public static FileSystem get(Configuration conf) throws IOException
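This overload takes only a Configuration, so the file system it returns is determined by the fs.defaultFS setting in that Configuration (the local file system when nothing is configured). A minimal, self-contained sketch of that pattern follows; the class name and path are illustrative only and do not come from the source files listed below.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class FileSystemGetSketch {
  public static void main(String[] args) throws IOException {
    // Picks up core-site.xml / hdfs-site.xml from the classpath, if present.
    Configuration conf = new Configuration();
    // Returns the FileSystem for fs.defaultFS; Hadoop typically caches and reuses this instance.
    FileSystem fs = FileSystem.get(conf);
    // Hypothetical path, used only for this sketch.
    Path path = new Path("/tmp/filesystem-get-example.txt");
    try (FSDataOutputStream out = fs.create(path, true)) {  // overwrite if the file exists
      out.writeUTF("hello");
    }
    System.out.println("exists: " + fs.exists(path));
  }
}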
From source file: co.cask.hydrator.plugin.batch.action.FileAction.java
License: Apache License
@SuppressWarnings("ConstantConditions")
@Override
public void run(BatchActionContext context) throws Exception {
  if (!config.shouldRun(context)) {
    return;
  }
  config.substituteMacros(context);

  Job job = JobUtils.createInstance();
  Configuration conf = job.getConfiguration();

  FileSystem fileSystem = FileSystem.get(conf);
  Path[] paths;
  Path sourcePath = new Path(config.path);
  if (fileSystem.isDirectory(sourcePath)) {
    FileStatus[] status = fileSystem.listStatus(sourcePath);
    paths = FileUtil.stat2Paths(status);
  } else {
    paths = new Path[] { sourcePath };
  }

  // get regex pattern for file name filtering.
  boolean patternSpecified = !Strings.isNullOrEmpty(config.pattern);
  if (patternSpecified) {
    regex = Pattern.compile(config.pattern);
  }

  switch (config.action.toLowerCase()) {
    case "delete":
      for (Path path : paths) {
        if (!patternSpecified || isFileNameMatch(path.getName())) {
          fileSystem.delete(path, true);
        }
      }
      break;
    case "move":
      for (Path path : paths) {
        if (!patternSpecified || isFileNameMatch(path.getName())) {
          Path targetFileMovePath = new Path(config.targetFolder, path.getName());
          fileSystem.rename(path, targetFileMovePath);
        }
      }
      break;
    case "archive":
      for (Path path : paths) {
        if (!patternSpecified || isFileNameMatch(path.getName())) {
          try (FSDataOutputStream archivedStream =
                 fileSystem.create(new Path(config.targetFolder, path.getName() + ".zip"));
               ZipOutputStream zipArchivedStream = new ZipOutputStream(archivedStream);
               FSDataInputStream fdDataInputStream = fileSystem.open(path)) {
            zipArchivedStream.putNextEntry(new ZipEntry(path.getName()));
            int length;
            byte[] buffer = new byte[1024];
            while ((length = fdDataInputStream.read(buffer)) > 0) {
              zipArchivedStream.write(buffer, 0, length);
            }
            zipArchivedStream.closeEntry();
          }
          fileSystem.delete(path, true);
        }
      }
      break;
    default:
      LOG.warn("No action required on the file.");
      break;
  }
}
From source file: co.cask.hydrator.plugin.batch.ETLMapReduceTestRun.java
License: Apache License
@Test
public void testFiletoMultipleTPFS() throws Exception {
  String filePath = "file:///tmp/test/text.txt";
  String testData = "String for testing purposes.";

  Path textFile = new Path(filePath);
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);
  FSDataOutputStream writeData = fs.create(textFile);
  writeData.write(testData.getBytes());
  writeData.flush();
  writeData.close();

  ETLStage source = new ETLStage("source",
    new ETLPlugin("File", BatchSource.PLUGIN_TYPE,
      ImmutableMap.<String, String>builder()
        .put(Constants.Reference.REFERENCE_NAME, "TestFile")
        .put(Properties.File.FILESYSTEM, "Text")
        .put(Properties.File.PATH, filePath)
        .build(),
      null));

  ETLStage sink1 = new ETLStage("sink1",
    new ETLPlugin("TPFSAvro", BatchSink.PLUGIN_TYPE,
      ImmutableMap.of(
        Properties.TimePartitionedFileSetDataset.SCHEMA, FileBatchSource.DEFAULT_SCHEMA.toString(),
        Properties.TimePartitionedFileSetDataset.TPFS_NAME, "fileSink1"),
      null));
  ETLStage sink2 = new ETLStage("sink2",
    new ETLPlugin("TPFSParquet", BatchSink.PLUGIN_TYPE,
      ImmutableMap.of(
        Properties.TimePartitionedFileSetDataset.SCHEMA, FileBatchSource.DEFAULT_SCHEMA.toString(),
        Properties.TimePartitionedFileSetDataset.TPFS_NAME, "fileSink2"),
      null));

  ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
    .addStage(source)
    .addStage(sink1)
    .addStage(sink2)
    .addConnection(source.getName(), sink1.getName())
    .addConnection(source.getName(), sink2.getName())
    .build();

  AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(ETLBATCH_ARTIFACT, etlConfig);
  Id.Application appId = Id.Application.from(Id.Namespace.DEFAULT, "FileToTPFS");
  ApplicationManager appManager = deployApplication(appId, appRequest);

  MapReduceManager mrManager = appManager.getMapReduceManager(ETLMapReduce.NAME);
  mrManager.start();
  mrManager.waitForFinish(2, TimeUnit.MINUTES);

  for (String sinkName : new String[] { "fileSink1", "fileSink2" }) {
    DataSetManager<TimePartitionedFileSet> fileSetManager = getDataset(sinkName);
    try (TimePartitionedFileSet fileSet = fileSetManager.get()) {
      List<GenericRecord> records = readOutput(fileSet, FileBatchSource.DEFAULT_SCHEMA);
      Assert.assertEquals(1, records.size());
      Assert.assertEquals(testData, records.get(0).get("body").toString());
    }
  }
}
From source file: co.cask.hydrator.plugin.batch.source.XMLReaderBatchSource.java
License: Apache License
@Override
public void prepareRun(BatchSourceContext context) throws Exception {
  Job job = JobUtils.createInstance();
  Configuration conf = job.getConfiguration();

  conf.set(XMLInputFormat.XML_INPUTFORMAT_PATH_NAME, config.path);
  conf.set(XMLInputFormat.XML_INPUTFORMAT_NODE_PATH, config.nodePath);
  if (StringUtils.isNotEmpty(config.pattern)) {
    conf.set(XMLInputFormat.XML_INPUTFORMAT_PATTERN, config.pattern);
  }
  conf.set(XMLInputFormat.XML_INPUTFORMAT_FILE_ACTION, config.actionAfterProcess);
  if (StringUtils.isNotEmpty(config.targetFolder)) {
    conf.set(XMLInputFormat.XML_INPUTFORMAT_TARGET_FOLDER, config.targetFolder);
  }

  setFileTrackingInfo(context, conf);

  // Create a temporary directory, in which XMLRecordReader will add file tracking information.
  fileSystem = FileSystem.get(conf);
  long startTime = context.getLogicalStartTime();
  // Create temp file name using start time to make it unique.
  String tempDirectory = config.tableName + startTime;
  tempDirectoryPath = new Path(config.temporaryFolder, tempDirectory);
  fileSystem.mkdirs(tempDirectoryPath);
  fileSystem.deleteOnExit(tempDirectoryPath);
  conf.set(XMLInputFormat.XML_INPUTFORMAT_PROCESSED_DATA_TEMP_FOLDER, tempDirectoryPath.toUri().toString());

  XMLInputFormat.setInputPathFilter(job, BatchXMLFileFilter.class);
  XMLInputFormat.addInputPath(job, new Path(config.path));
  context.setInput(Input.of(config.referenceName, new SourceInputFormatProvider(XMLInputFormat.class, conf)));
}
From source file: co.cask.hydrator.plugin.db.batch.action.VerticaBulkImportAction.java
License: Apache License
@Override
public void run(ActionContext context) throws Exception {
  Object driver = Class.forName("com.vertica.jdbc.Driver").newInstance();
  DriverManager.registerDriver((Driver) driver);

  Preconditions.checkArgument(tableExists(config.tableName),
    "Table %s does not exist. Please check that the 'tableName' property "
      + "has been set correctly, and that the connection string %s points to a valid database.",
    config.tableName, config.connectionString);

  String copyStatement;
  if (config.level.equalsIgnoreCase("basic")) {
    // COPY tableName FROM STDIN DELIMITER 'delimiter'
    copyStatement = String.format("COPY %s FROM STDIN DELIMITER '%s'", config.tableName, config.delimiter);
  } else {
    copyStatement = config.copyStatement;
  }
  LOG.debug("Copy statement is: {}", copyStatement);

  try {
    try (Connection connection = DriverManager.getConnection(config.connectionString, config.user,
                                                             config.password)) {
      connection.setAutoCommit(false);
      // run Copy statement
      VerticaCopyStream stream = new VerticaCopyStream((VerticaConnection) connection, copyStatement);
      // Keep running count of the number of rejects
      int totalRejects = 0;

      // start() starts the stream process, and opens the COPY command.
      stream.start();

      FileSystem fs = FileSystem.get(new Configuration());

      List<String> fileList = new ArrayList<>();
      FileStatus[] fileStatus;
      try {
        fileStatus = fs.listStatus(new Path(config.path));
        for (FileStatus fileStat : fileStatus) {
          fileList.add(fileStat.getPath().toString());
        }
      } catch (FileNotFoundException e) {
        throw new IllegalArgumentException(String.format(
          "Path %s not found on file system. Please provide correct path.", config.path), e);
      }

      if (fileStatus.length <= 0) {
        LOG.warn("No files available to load into vertica database");
      }

      for (String file : fileList) {
        Path path = new Path(file);
        FSDataInputStream inputStream = fs.open(path);
        // Add stream to the VerticaCopyStream
        stream.addStream(inputStream);
        // call execute() to load the newly added stream. You could
        // add many streams and call execute once to load them all.
        // Which method you choose depends mainly on whether you want
        // the ability to check the number of rejections as the load
        // progresses so you can stop if the number of rejects gets too
        // high. Also, high numbers of InputStreams could create a
        // resource issue on your client system.
        stream.execute();

        // Show any rejects from this execution of the stream load.
        // getRejects() returns a List containing the
        // row numbers of rejected rows.
        List<Long> rejects = stream.getRejects();

        // The size of the list gives you the number of rejected rows.
        int numRejects = rejects.size();
        totalRejects += numRejects;

        if (config.autoCommit.equalsIgnoreCase("true")) {
          // Commit the loaded data
          connection.commit();
        }
      }

      // Finish closes the COPY command. It returns the number of
      // rows inserted.
      long results = stream.finish();

      context.getMetrics().gauge("num.of.rows.rejected", totalRejects);
      context.getMetrics().gauge("num.of.rows.inserted", results);

      // Commit the loaded data
      connection.commit();
    }
  } catch (Exception e) {
    throw new RuntimeException(String.format("Exception while running copy statement %s", copyStatement), e);
  } finally {
    DriverManager.deregisterDriver((Driver) driver);
  }
}
From source file: co.cask.hydrator.plugin.HDFSSinkTest.java
License: Apache License
@Before
public void beforeTest() throws Exception {
  // Setup Hadoop Minicluster
  File baseDir = temporaryFolder.newFolder();
  Configuration conf = new Configuration();
  conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, baseDir.getAbsolutePath());
  MiniDFSCluster.Builder builder = new MiniDFSCluster.Builder(conf);
  dfsCluster = builder.build();
  dfsCluster.waitActive();
  fileSystem = FileSystem.get(conf);
}
From source file: co.cask.hydrator.plugin.hive.action.HiveExport.java
License: Apache License
@Override
public void configurePipeline(PipelineConfigurer pipelineConfigurer) throws IllegalArgumentException {
  // validate hive command. For export we only accept Select statements
  SqlParser parser = SqlParser.create(config.statement);
  try {
    SqlNode sqlNode = parser.parseQuery();
    if (!(sqlNode instanceof SqlSelect)) {
      throw new IllegalArgumentException("Hive Export only uses Select statements. Please provide valid hive "
                                           + "select statement.");
    }
  } catch (SqlParseException e) {
    throw new IllegalArgumentException("Error while parsing select statement. Please provide a valid hive select "
                                         + "statement.");
  }

  // validate if the directory already exists
  if (config.overwrite.equalsIgnoreCase("no")) {
    Configuration configuration = new Configuration();
    try {
      FileSystem fs = FileSystem.get(configuration);
      if (fs.exists(new Path(config.path))) {
        throw new IllegalArgumentException(String.format("The path %s already exists. Please either delete that "
                                                           + "path or provide another path.", config.path));
      }
    } catch (IOException e) {
      throw new RuntimeException("Exception occurred while doing directory check", e);
    }
  }
}
From source file: co.cask.tephra.hbase10.coprocessor.TransactionProcessorTest.java
License: Apache License
private HRegion createRegion(String tableName, byte[] family, long ttl) throws IOException {
  HTableDescriptor htd = new HTableDescriptor(TableName.valueOf(tableName));
  HColumnDescriptor cfd = new HColumnDescriptor(family);
  if (ttl > 0) {
    cfd.setValue(TxConstants.PROPERTY_TTL, String.valueOf(ttl));
  }
  cfd.setMaxVersions(10);
  htd.addFamily(cfd);
  htd.addCoprocessor(TransactionProcessor.class.getName());
  Path tablePath = FSUtils.getTableDir(FSUtils.getRootDir(conf), htd.getTableName());
  FileSystem fs = FileSystem.get(conf);
  assertTrue(fs.mkdirs(tablePath));
  WALFactory walFactory = new WALFactory(conf, null, tableName + ".hlog");
  WAL hLog = walFactory.getWAL(new byte[] { 1 });
  HRegionInfo regionInfo = new HRegionInfo(TableName.valueOf(tableName));
  HRegionFileSystem regionFS = HRegionFileSystem.createRegionOnFileSystem(conf, fs, tablePath, regionInfo);
  return new HRegion(regionFS, hLog, conf, htd,
    new LocalRegionServerServices(conf, ServerName.valueOf(InetAddress.getLocalHost().getHostName(), 0,
                                                           System.currentTimeMillis())));
}
From source file: co.cask.tephra.hbase94.coprocessor.TransactionProcessorTest.java
License: Apache License
@Test
public void testDataJanitorRegionScanner() throws Exception {
  String tableName = "TestDataJanitorRegionScanner";
  byte[] familyBytes = Bytes.toBytes("f");
  byte[] columnBytes = Bytes.toBytes("c");
  HTableDescriptor htd = new HTableDescriptor(tableName);
  HColumnDescriptor cfd = new HColumnDescriptor(familyBytes);
  // with that, all older than upper visibility bound by 3 hours should be expired by TTL logic
  cfd.setValue(TxConstants.PROPERTY_TTL, String.valueOf(TimeUnit.HOURS.toMillis(3)));
  cfd.setMaxVersions(10);
  htd.addFamily(cfd);
  htd.addCoprocessor(TransactionProcessor.class.getName());
  Path tablePath = new Path("/tmp/" + tableName);
  Path hlogPath = new Path("/tmp/hlog");
  Path oldPath = new Path("/tmp/.oldLogs");
  Configuration hConf = conf;
  FileSystem fs = FileSystem.get(hConf);
  assertTrue(fs.mkdirs(tablePath));
  HLog hlog = new HLog(fs, hlogPath, oldPath, hConf);
  HRegion region = new HRegion(tablePath, hlog, fs, hConf, new HRegionInfo(Bytes.toBytes(tableName)), htd,
                               new MockRegionServerServices());
  try {
    region.initialize();
    TransactionStateCache cache = new TransactionStateCacheSupplier(hConf).get();
    LOG.info("Coprocessor is using transaction state: " + cache.getLatestState());

    for (int i = 1; i <= 8; i++) {
      for (int k = 1; k <= i; k++) {
        Put p = new Put(Bytes.toBytes(i));
        p.add(familyBytes, columnBytes, V[k], Bytes.toBytes(V[k]));
        region.put(p);
      }
    }

    List<KeyValue> results = Lists.newArrayList();

    // force a flush to clear the data
    // during flush, the coprocessor should drop all KeyValues with timestamps in the invalid set
    LOG.info("Flushing region " + region.getRegionNameAsString());
    region.flushcache();

    // now a normal scan should only return the valid rows - testing that cleanup works on flush
    Scan scan = new Scan();
    scan.setMaxVersions(10);
    RegionScanner regionScanner = region.getScanner(scan);

    // first returned value should be "4" with version "4"
    results.clear();
    assertTrue(regionScanner.next(results));
    assertKeyValueMatches(results, 4, new long[] { V[4] });

    results.clear();
    assertTrue(regionScanner.next(results));
    assertKeyValueMatches(results, 5, new long[] { V[4] });

    results.clear();
    assertTrue(regionScanner.next(results));
    assertKeyValueMatches(results, 6, new long[] { V[6], V[4] });

    results.clear();
    assertTrue(regionScanner.next(results));
    assertKeyValueMatches(results, 7, new long[] { V[6], V[4] });

    results.clear();
    assertFalse(regionScanner.next(results));
    assertKeyValueMatches(results, 8, new long[] { V[8], V[6], V[4] });
  } finally {
    region.close();
  }
}
From source file: co.cask.tephra.hbase94.coprocessor.TransactionProcessorTest.java
License: Apache License
@Test
public void testDeleteFiltering() throws Exception {
  String tableName = "TestDeleteFiltering";
  byte[] familyBytes = Bytes.toBytes("f");
  byte[] columnBytes = Bytes.toBytes("c");
  HTableDescriptor htd = new HTableDescriptor(tableName);
  HColumnDescriptor cfd = new HColumnDescriptor(familyBytes);
  cfd.setMaxVersions(10);
  htd.addFamily(cfd);
  htd.addCoprocessor(TransactionProcessor.class.getName());
  Path tablePath = new Path("/tmp/" + tableName);
  Path hlogPath = new Path("/tmp/hlog-" + tableName);
  Path oldPath = new Path("/tmp/.oldLogs-" + tableName);
  Configuration hConf = conf;
  FileSystem fs = FileSystem.get(hConf);
  assertTrue(fs.mkdirs(tablePath));
  HLog hlog = new HLog(fs, hlogPath, oldPath, hConf);
  HRegion region = new HRegion(tablePath, hlog, fs, hConf, new HRegionInfo(Bytes.toBytes(tableName)), htd,
                               new MockRegionServerServices());
  try {
    region.initialize();
    TransactionStateCache cache = new TransactionStateCacheSupplier(hConf).get();
    LOG.info("Coprocessor is using transaction state: " + cache.getLatestState());

    byte[] row = Bytes.toBytes(1);
    for (int i = 4; i < V.length; i++) {
      if (i != 5) {
        Put p = new Put(row);
        p.add(familyBytes, columnBytes, V[i], Bytes.toBytes(V[i]));
        region.put(p);
      }
    }

    // delete from the third entry back
    Delete d = new Delete(row, V[5]);
    region.delete(d, false);

    List<KeyValue> results = Lists.newArrayList();

    // force a flush to clear the data
    // during flush, we should drop the deleted version, but not the others
    LOG.info("Flushing region " + region.getRegionNameAsString());
    region.flushcache();

    // now a normal scan should return row with versions at: V[8], V[6].
    // V[7] is invalid and V[5] and prior are deleted.
    Scan scan = new Scan();
    scan.setMaxVersions(10);
    RegionScanner regionScanner = region.getScanner(scan);
    // should be only one row
    assertFalse(regionScanner.next(results));
    assertKeyValueMatches(results, 1, new long[] { V[8], V[6] });
  } finally {
    region.close();
  }
}
From source file: co.cask.tephra.hbase94.coprocessor.TransactionProcessorTest.java
License: Apache License
private HRegion createRegion(String tableName, byte[] family, long ttl) throws IOException {
  HTableDescriptor htd = new HTableDescriptor(tableName);
  HColumnDescriptor cfd = new HColumnDescriptor(family);
  if (ttl > 0) {
    cfd.setValue(TxConstants.PROPERTY_TTL, String.valueOf(ttl));
  }
  cfd.setMaxVersions(10);
  htd.addFamily(cfd);
  htd.addCoprocessor(TransactionProcessor.class.getName());
  Path tablePath = new Path("/tmp/" + tableName);
  Path hlogPath = new Path("/tmp/hlog-" + tableName);
  Path oldPath = new Path("/tmp/.oldLogs-" + tableName);
  Configuration hConf = conf;
  FileSystem fs = FileSystem.get(hConf);
  assertTrue(fs.mkdirs(tablePath));
  HLog hlog = new HLog(fs, hlogPath, oldPath, hConf);
  return new HRegion(tablePath, hlog, fs, hConf, new HRegionInfo(Bytes.toBytes(tableName)), htd,
                     new MockRegionServerServices());
}