Usage examples for org.apache.hadoop.fs.FileSystem.get(Configuration)
public static FileSystem get(Configuration conf) throws IOException
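This overload takes only a Configuration, so the file system it returns is determined by the fs.defaultFS setting in that Configuration (the local file system when nothing is configured). A minimal, self-contained sketch of that pattern follows; the class name and path are illustrative only and do not come from the source files listed below.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class FileSystemGetSketch {
  public static void main(String[] args) throws IOException {
    // Picks up core-site.xml / hdfs-site.xml from the classpath, if present.
    Configuration conf = new Configuration();
    // Returns the FileSystem for fs.defaultFS; Hadoop typically caches and reuses this instance.
    FileSystem fs = FileSystem.get(conf);
    // Hypothetical path, used only for this sketch.
    Path path = new Path("/tmp/filesystem-get-example.txt");
    try (FSDataOutputStream out = fs.create(path, true)) {  // overwrite if the file exists
      out.writeUTF("hello");
    }
    System.out.println("exists: " + fs.exists(path));
  }
}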
From source file: co.cask.hydrator.plugin.batch.action.FileAction.java
License: Apache License
@SuppressWarnings("ConstantConditions")
@Override
public void run(BatchActionContext context) throws Exception {
  if (!config.shouldRun(context)) {
    return;
  }
  config.substituteMacros(context);

  Job job = JobUtils.createInstance();
  Configuration conf = job.getConfiguration();

  FileSystem fileSystem = FileSystem.get(conf);
  Path[] paths;
  Path sourcePath = new Path(config.path);
  if (fileSystem.isDirectory(sourcePath)) {
    FileStatus[] status = fileSystem.listStatus(sourcePath);
    paths = FileUtil.stat2Paths(status);
  } else {
    paths = new Path[] { sourcePath };
  }

  // get regex pattern for file name filtering.
  boolean patternSpecified = !Strings.isNullOrEmpty(config.pattern);
  if (patternSpecified) {
    regex = Pattern.compile(config.pattern);
  }

  switch (config.action.toLowerCase()) {
    case "delete":
      for (Path path : paths) {
        if (!patternSpecified || isFileNameMatch(path.getName())) {
          fileSystem.delete(path, true);
        }
      }
      break;
    case "move":
      for (Path path : paths) {
        if (!patternSpecified || isFileNameMatch(path.getName())) {
          Path targetFileMovePath = new Path(config.targetFolder, path.getName());
          fileSystem.rename(path, targetFileMovePath);
        }
      }
      break;
    case "archive":
      for (Path path : paths) {
        if (!patternSpecified || isFileNameMatch(path.getName())) {
          try (FSDataOutputStream archivedStream =
                 fileSystem.create(new Path(config.targetFolder, path.getName() + ".zip"));
               ZipOutputStream zipArchivedStream = new ZipOutputStream(archivedStream);
               FSDataInputStream fdDataInputStream = fileSystem.open(path)) {
            zipArchivedStream.putNextEntry(new ZipEntry(path.getName()));
            int length;
            byte[] buffer = new byte[1024];
            while ((length = fdDataInputStream.read(buffer)) > 0) {
              zipArchivedStream.write(buffer, 0, length);
            }
            zipArchivedStream.closeEntry();
          }
          fileSystem.delete(path, true);
        }
      }
      break;
    default:
      LOG.warn("No action required on the file.");
      break;
  }
}
From source file: co.cask.hydrator.plugin.batch.ETLMapReduceTestRun.java
License: Apache License
@Test
public void testFiletoMultipleTPFS() throws Exception {
  String filePath = "file:///tmp/test/text.txt";
  String testData = "String for testing purposes.";

  Path textFile = new Path(filePath);
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);
  FSDataOutputStream writeData = fs.create(textFile);
  writeData.write(testData.getBytes());
  writeData.flush();
  writeData.close();

  ETLStage source = new ETLStage("source",
    new ETLPlugin("File", BatchSource.PLUGIN_TYPE,
      ImmutableMap.<String, String>builder()
        .put(Constants.Reference.REFERENCE_NAME, "TestFile")
        .put(Properties.File.FILESYSTEM, "Text")
        .put(Properties.File.PATH, filePath)
        .build(),
      null));

  ETLStage sink1 = new ETLStage("sink1",
    new ETLPlugin("TPFSAvro", BatchSink.PLUGIN_TYPE,
      ImmutableMap.of(
        Properties.TimePartitionedFileSetDataset.SCHEMA, FileBatchSource.DEFAULT_SCHEMA.toString(),
        Properties.TimePartitionedFileSetDataset.TPFS_NAME, "fileSink1"),
      null));
  ETLStage sink2 = new ETLStage("sink2",
    new ETLPlugin("TPFSParquet", BatchSink.PLUGIN_TYPE,
      ImmutableMap.of(
        Properties.TimePartitionedFileSetDataset.SCHEMA, FileBatchSource.DEFAULT_SCHEMA.toString(),
        Properties.TimePartitionedFileSetDataset.TPFS_NAME, "fileSink2"),
      null));

  ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
    .addStage(source)
    .addStage(sink1)
    .addStage(sink2)
    .addConnection(source.getName(), sink1.getName())
    .addConnection(source.getName(), sink2.getName())
    .build();

  AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(ETLBATCH_ARTIFACT, etlConfig);
  Id.Application appId = Id.Application.from(Id.Namespace.DEFAULT, "FileToTPFS");
  ApplicationManager appManager = deployApplication(appId, appRequest);

  MapReduceManager mrManager = appManager.getMapReduceManager(ETLMapReduce.NAME);
  mrManager.start();
  mrManager.waitForFinish(2, TimeUnit.MINUTES);

  for (String sinkName : new String[] { "fileSink1", "fileSink2" }) {
    DataSetManager<TimePartitionedFileSet> fileSetManager = getDataset(sinkName);
    try (TimePartitionedFileSet fileSet = fileSetManager.get()) {
      List<GenericRecord> records = readOutput(fileSet, FileBatchSource.DEFAULT_SCHEMA);
      Assert.assertEquals(1, records.size());
      Assert.assertEquals(testData, records.get(0).get("body").toString());
    }
  }
}
From source file: co.cask.hydrator.plugin.batch.source.XMLReaderBatchSource.java
License: Apache License
@Override
public void prepareRun(BatchSourceContext context) throws Exception {
  Job job = JobUtils.createInstance();
  Configuration conf = job.getConfiguration();

  conf.set(XMLInputFormat.XML_INPUTFORMAT_PATH_NAME, config.path);
  conf.set(XMLInputFormat.XML_INPUTFORMAT_NODE_PATH, config.nodePath);
  if (StringUtils.isNotEmpty(config.pattern)) {
    conf.set(XMLInputFormat.XML_INPUTFORMAT_PATTERN, config.pattern);
  }
  conf.set(XMLInputFormat.XML_INPUTFORMAT_FILE_ACTION, config.actionAfterProcess);
  if (StringUtils.isNotEmpty(config.targetFolder)) {
    conf.set(XMLInputFormat.XML_INPUTFORMAT_TARGET_FOLDER, config.targetFolder);
  }

  setFileTrackingInfo(context, conf);

  // Create a temporary directory, in which XMLRecordReader will add file tracking information.
  fileSystem = FileSystem.get(conf);
  long startTime = context.getLogicalStartTime();
  // Create temp file name using start time to make it unique.
  String tempDirectory = config.tableName + startTime;
  tempDirectoryPath = new Path(config.temporaryFolder, tempDirectory);
  fileSystem.mkdirs(tempDirectoryPath);
  fileSystem.deleteOnExit(tempDirectoryPath);
  conf.set(XMLInputFormat.XML_INPUTFORMAT_PROCESSED_DATA_TEMP_FOLDER, tempDirectoryPath.toUri().toString());

  XMLInputFormat.setInputPathFilter(job, BatchXMLFileFilter.class);
  XMLInputFormat.addInputPath(job, new Path(config.path));
  context.setInput(Input.of(config.referenceName, new SourceInputFormatProvider(XMLInputFormat.class, conf)));
}
From source file: co.cask.hydrator.plugin.db.batch.action.VerticaBulkImportAction.java
License: Apache License
@Override
public void run(ActionContext context) throws Exception {
  Object driver = Class.forName("com.vertica.jdbc.Driver").newInstance();
  DriverManager.registerDriver((Driver) driver);

  Preconditions.checkArgument(tableExists(config.tableName),
    "Table %s does not exist. Please check that the 'tableName' property "
      + "has been set correctly, and that the connection string %s points to a valid database.",
    config.tableName, config.connectionString);

  String copyStatement;
  if (config.level.equalsIgnoreCase("basic")) {
    // COPY tableName FROM STDIN DELIMITER 'delimiter'
    copyStatement = String.format("COPY %s FROM STDIN DELIMITER '%s'", config.tableName, config.delimiter);
  } else {
    copyStatement = config.copyStatement;
  }
  LOG.debug("Copy statement is: {}", copyStatement);

  try {
    try (Connection connection = DriverManager.getConnection(config.connectionString, config.user,
                                                             config.password)) {
      connection.setAutoCommit(false);
      // run Copy statement
      VerticaCopyStream stream = new VerticaCopyStream((VerticaConnection) connection, copyStatement);
      // Keep running count of the number of rejects
      int totalRejects = 0;

      // start() starts the stream process, and opens the COPY command.
      stream.start();

      FileSystem fs = FileSystem.get(new Configuration());

      List<String> fileList = new ArrayList<>();
      FileStatus[] fileStatus;
      try {
        fileStatus = fs.listStatus(new Path(config.path));
        for (FileStatus fileStat : fileStatus) {
          fileList.add(fileStat.getPath().toString());
        }
      } catch (FileNotFoundException e) {
        throw new IllegalArgumentException(String.format(
          "Path %s not found on file system. Please provide correct path.", config.path), e);
      }

      if (fileStatus.length <= 0) {
        LOG.warn("No files available to load into vertica database");
      }

      for (String file : fileList) {
        Path path = new Path(file);
        FSDataInputStream inputStream = fs.open(path);
        // Add stream to the VerticaCopyStream
        stream.addStream(inputStream);
        // call execute() to load the newly added stream. You could
        // add many streams and call execute once to load them all.
        // Which method you choose depends mainly on whether you want
        // the ability to check the number of rejections as the load
        // progresses so you can stop if the number of rejects gets too
        // high. Also, high numbers of InputStreams could create a
        // resource issue on your client system.
        stream.execute();

        // Show any rejects from this execution of the stream load.
        // getRejects() returns a List containing the
        // row numbers of rejected rows.
        List<Long> rejects = stream.getRejects();

        // The size of the list gives you the number of rejected rows.
        int numRejects = rejects.size();
        totalRejects += numRejects;

        if (config.autoCommit.equalsIgnoreCase("true")) {
          // Commit the loaded data
          connection.commit();
        }
      }

      // Finish closes the COPY command. It returns the number of
      // rows inserted.
      long results = stream.finish();

      context.getMetrics().gauge("num.of.rows.rejected", totalRejects);
      context.getMetrics().gauge("num.of.rows.inserted", results);

      // Commit the loaded data
      connection.commit();
    }
  } catch (Exception e) {
    throw new RuntimeException(String.format("Exception while running copy statement %s", copyStatement), e);
  } finally {
    DriverManager.deregisterDriver((Driver) driver);
  }
}
From source file: co.cask.hydrator.plugin.HDFSSinkTest.java
License: Apache License
@Before
public void beforeTest() throws Exception {
  // Setup Hadoop Minicluster
  File baseDir = temporaryFolder.newFolder();
  Configuration conf = new Configuration();
  conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, baseDir.getAbsolutePath());
  MiniDFSCluster.Builder builder = new MiniDFSCluster.Builder(conf);
  dfsCluster = builder.build();
  dfsCluster.waitActive();
  fileSystem = FileSystem.get(conf);
}
From source file: co.cask.hydrator.plugin.hive.action.HiveExport.java
License: Apache License
@Override
public void configurePipeline(PipelineConfigurer pipelineConfigurer) throws IllegalArgumentException {
  // validate hive command. For export we only accept Select statements
  SqlParser parser = SqlParser.create(config.statement);
  try {
    SqlNode sqlNode = parser.parseQuery();
    if (!(sqlNode instanceof SqlSelect)) {
      throw new IllegalArgumentException("Hive Export only uses Select statements. Please provide valid hive "
                                           + "select statement.");
    }
  } catch (SqlParseException e) {
    throw new IllegalArgumentException("Error while parsing select statement. Please provide a valid hive select "
                                         + "statement.");
  }

  // validate if the directory already exists
  if (config.overwrite.equalsIgnoreCase("no")) {
    Configuration configuration = new Configuration();
    try {
      FileSystem fs = FileSystem.get(configuration);
      if (fs.exists(new Path(config.path))) {
        throw new IllegalArgumentException(String.format("The path %s already exists. Please either delete that "
                                                           + "path or provide another path.", config.path));
      }
    } catch (IOException e) {
      throw new RuntimeException("Exception occurred while doing directory check", e);
    }
  }
}
From source file: co.cask.tephra.hbase10.coprocessor.TransactionProcessorTest.java
License: Apache License
private HRegion createRegion(String tableName, byte[] family, long ttl) throws IOException {
  HTableDescriptor htd = new HTableDescriptor(TableName.valueOf(tableName));
  HColumnDescriptor cfd = new HColumnDescriptor(family);
  if (ttl > 0) {
    cfd.setValue(TxConstants.PROPERTY_TTL, String.valueOf(ttl));
  }
  cfd.setMaxVersions(10);
  htd.addFamily(cfd);
  htd.addCoprocessor(TransactionProcessor.class.getName());
  Path tablePath = FSUtils.getTableDir(FSUtils.getRootDir(conf), htd.getTableName());
  FileSystem fs = FileSystem.get(conf);
  assertTrue(fs.mkdirs(tablePath));
  WALFactory walFactory = new WALFactory(conf, null, tableName + ".hlog");
  WAL hLog = walFactory.getWAL(new byte[] { 1 });
  HRegionInfo regionInfo = new HRegionInfo(TableName.valueOf(tableName));
  HRegionFileSystem regionFS = HRegionFileSystem.createRegionOnFileSystem(conf, fs, tablePath, regionInfo);
  return new HRegion(regionFS, hLog, conf, htd,
    new LocalRegionServerServices(conf, ServerName.valueOf(InetAddress.getLocalHost().getHostName(), 0,
                                                           System.currentTimeMillis())));
}
From source file: co.cask.tephra.hbase94.coprocessor.TransactionProcessorTest.java
License: Apache License
@Test
public void testDataJanitorRegionScanner() throws Exception {
  String tableName = "TestDataJanitorRegionScanner";
  byte[] familyBytes = Bytes.toBytes("f");
  byte[] columnBytes = Bytes.toBytes("c");
  HTableDescriptor htd = new HTableDescriptor(tableName);
  HColumnDescriptor cfd = new HColumnDescriptor(familyBytes);
  // with that, all older than upper visibility bound by 3 hours should be expired by TTL logic
  cfd.setValue(TxConstants.PROPERTY_TTL, String.valueOf(TimeUnit.HOURS.toMillis(3)));
  cfd.setMaxVersions(10);
  htd.addFamily(cfd);
  htd.addCoprocessor(TransactionProcessor.class.getName());
  Path tablePath = new Path("/tmp/" + tableName);
  Path hlogPath = new Path("/tmp/hlog");
  Path oldPath = new Path("/tmp/.oldLogs");
  Configuration hConf = conf;
  FileSystem fs = FileSystem.get(hConf);
  assertTrue(fs.mkdirs(tablePath));
  HLog hlog = new HLog(fs, hlogPath, oldPath, hConf);
  HRegion region = new HRegion(tablePath, hlog, fs, hConf, new HRegionInfo(Bytes.toBytes(tableName)), htd,
                               new MockRegionServerServices());
  try {
    region.initialize();
    TransactionStateCache cache = new TransactionStateCacheSupplier(hConf).get();
    LOG.info("Coprocessor is using transaction state: " + cache.getLatestState());

    for (int i = 1; i <= 8; i++) {
      for (int k = 1; k <= i; k++) {
        Put p = new Put(Bytes.toBytes(i));
        p.add(familyBytes, columnBytes, V[k], Bytes.toBytes(V[k]));
        region.put(p);
      }
    }

    List<KeyValue> results = Lists.newArrayList();

    // force a flush to clear the data
    // during flush, the coprocessor should drop all KeyValues with timestamps in the invalid set
    LOG.info("Flushing region " + region.getRegionNameAsString());
    region.flushcache();

    // now a normal scan should only return the valid rows - testing that cleanup works on flush
    Scan scan = new Scan();
    scan.setMaxVersions(10);
    RegionScanner regionScanner = region.getScanner(scan);

    // first returned value should be "4" with version "4"
    results.clear();
    assertTrue(regionScanner.next(results));
    assertKeyValueMatches(results, 4, new long[] { V[4] });

    results.clear();
    assertTrue(regionScanner.next(results));
    assertKeyValueMatches(results, 5, new long[] { V[4] });

    results.clear();
    assertTrue(regionScanner.next(results));
    assertKeyValueMatches(results, 6, new long[] { V[6], V[4] });

    results.clear();
    assertTrue(regionScanner.next(results));
    assertKeyValueMatches(results, 7, new long[] { V[6], V[4] });

    results.clear();
    assertFalse(regionScanner.next(results));
    assertKeyValueMatches(results, 8, new long[] { V[8], V[6], V[4] });
  } finally {
    region.close();
  }
}
From source file: co.cask.tephra.hbase94.coprocessor.TransactionProcessorTest.java
License: Apache License
@Test
public void testDeleteFiltering() throws Exception {
  String tableName = "TestDeleteFiltering";
  byte[] familyBytes = Bytes.toBytes("f");
  byte[] columnBytes = Bytes.toBytes("c");
  HTableDescriptor htd = new HTableDescriptor(tableName);
  HColumnDescriptor cfd = new HColumnDescriptor(familyBytes);
  cfd.setMaxVersions(10);
  htd.addFamily(cfd);
  htd.addCoprocessor(TransactionProcessor.class.getName());
  Path tablePath = new Path("/tmp/" + tableName);
  Path hlogPath = new Path("/tmp/hlog-" + tableName);
  Path oldPath = new Path("/tmp/.oldLogs-" + tableName);
  Configuration hConf = conf;
  FileSystem fs = FileSystem.get(hConf);
  assertTrue(fs.mkdirs(tablePath));
  HLog hlog = new HLog(fs, hlogPath, oldPath, hConf);
  HRegion region = new HRegion(tablePath, hlog, fs, hConf, new HRegionInfo(Bytes.toBytes(tableName)), htd,
                               new MockRegionServerServices());
  try {
    region.initialize();
    TransactionStateCache cache = new TransactionStateCacheSupplier(hConf).get();
    LOG.info("Coprocessor is using transaction state: " + cache.getLatestState());

    byte[] row = Bytes.toBytes(1);
    for (int i = 4; i < V.length; i++) {
      if (i != 5) {
        Put p = new Put(row);
        p.add(familyBytes, columnBytes, V[i], Bytes.toBytes(V[i]));
        region.put(p);
      }
    }

    // delete from the third entry back
    Delete d = new Delete(row, V[5]);
    region.delete(d, false);

    List<KeyValue> results = Lists.newArrayList();

    // force a flush to clear the data
    // during flush, we should drop the deleted version, but not the others
    LOG.info("Flushing region " + region.getRegionNameAsString());
    region.flushcache();

    // now a normal scan should return row with versions at: V[8], V[6].
    // V[7] is invalid and V[5] and prior are deleted.
    Scan scan = new Scan();
    scan.setMaxVersions(10);
    RegionScanner regionScanner = region.getScanner(scan);
    // should be only one row
    assertFalse(regionScanner.next(results));
    assertKeyValueMatches(results, 1, new long[] { V[8], V[6] });
  } finally {
    region.close();
  }
}
From source file: co.cask.tephra.hbase94.coprocessor.TransactionProcessorTest.java
License: Apache License
private HRegion createRegion(String tableName, byte[] family, long ttl) throws IOException {
  HTableDescriptor htd = new HTableDescriptor(tableName);
  HColumnDescriptor cfd = new HColumnDescriptor(family);
  if (ttl > 0) {
    cfd.setValue(TxConstants.PROPERTY_TTL, String.valueOf(ttl));
  }
  cfd.setMaxVersions(10);
  htd.addFamily(cfd);
  htd.addCoprocessor(TransactionProcessor.class.getName());
  Path tablePath = new Path("/tmp/" + tableName);
  Path hlogPath = new Path("/tmp/hlog-" + tableName);
  Path oldPath = new Path("/tmp/.oldLogs-" + tableName);
  Configuration hConf = conf;
  FileSystem fs = FileSystem.get(hConf);
  assertTrue(fs.mkdirs(tablePath));
  HLog hlog = new HLog(fs, hlogPath, oldPath, hConf);
  return new HRegion(tablePath, hlog, fs, hConf, new HRegionInfo(Bytes.toBytes(tableName)), htd,
                     new MockRegionServerServices());
}