Example usage for org.apache.hadoop.fs FileSystem copyFromLocalFile

Introduction

On this page you can find example usage of org.apache.hadoop.fs FileSystem copyFromLocalFile.

Prototype

public void copyFromLocalFile(Path src, Path dst) throws IOException 

Document

The src file is on the local disk. Add it to the filesystem at the given dst name; the source is kept intact afterwards.
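
For orientation before the real-world examples below, here is a minimal sketch of the two-argument overload. The class name and both paths (/tmp/notes.txt, /data/incoming/notes.txt) are hypothetical, chosen purely for illustration; the sketch assumes a default Configuration:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class CopyFromLocalExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Resolves to the default filesystem named in the configuration
        // (e.g. HDFS on a cluster, the local filesystem otherwise).
        FileSystem fs = FileSystem.get(conf);

        // Hypothetical source file on the local disk and destination path.
        Path src = new Path("/tmp/notes.txt");
        Path dst = new Path("/data/incoming/notes.txt");

        // Copy the local file to dst; the local source is left intact.
        fs.copyFromLocalFile(src, dst);
    }
}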

Usage

From source file: com.tdunning.plume.local.lazy.MapRedSingleFlattenChannelTest.java

License: Apache License

@Test
public void test() throws Exception {
    String outputPath = "/tmp/output-plume-singleflattenchanneltest";
    String inputPath = "/tmp/input-wordcount.txt";
    String inputPath2 = "/tmp/input-moretext.txt";
    // Prepare input for test
    FileSystem system = FileSystem.getLocal(new Configuration());
    system.copyFromLocalFile(new Path(Resources.getResource("simple-text.txt").getPath()), new Path(inputPath));
    system.copyFromLocalFile(new Path(Resources.getResource("simple-text.txt").getPath()),
            new Path(inputPath2));
    // Prepare output for test
    system.delete(new Path(outputPath), true);
    // Prepare workflow
    MapRedSingleFlattenChannelTestWorkflow workFlow = new MapRedSingleFlattenChannelTestWorkflow();
    // Execute it
    MapRedExecutor executor = new MapRedExecutor();
    executor.execute(workFlow, outputPath);
    /**
     * TODO add test validation
     */
}

From source file: com.tdunning.plume.local.lazy.MapRedTwoSequentialGBKTest.java

License: Apache License

@Test
public void test() throws Exception {
    String outputPath = "/tmp/output-plume-twosequentialgbktest";
    String inputPath = "/tmp/input-wordcount.txt";
    // Prepare input for test
    FileSystem system = FileSystem.getLocal(new Configuration());
    system.copyFromLocalFile(new Path(Resources.getResource("simple-text.txt").getPath()), new Path(inputPath));
    // Prepare output for test
    system.delete(new Path(outputPath), true);
    // Prepare workflow
    TwoSequentialGBKWorkflow workFlow = new TwoSequentialGBKWorkflow();

    // Execute it
    MapRedExecutor executor = new MapRedExecutor();
    executor.execute(workFlow, outputPath);

    String outputId = ((LazyCollection<?>) workFlow.getOutputs().get(0)).getPlumeId();
    List<String> str = Files.readLines(new File(outputPath + "/" + outputId + "/1-r-00000"), Charsets.UTF_8);

    Map<String, String> m = Maps.newHashMap();
    for (String line : str) {
        m.put(line.split("\t")[0], line.split("\t")[1]); // not super-optimal, but less code
    }
    assertEquals("bar 2", m.get("To test text processing with some simple"));
    assertEquals("bar 2", m.get("examples is some simple"));
    assertEquals("bar 2", m.get("is is"));
    assertEquals("bar 2", m.get("some simple text"));
}

From source file: com.tdunning.plume.local.lazy.MapRedWordCountTest.java

License: Apache License

/**
 * The wordcount example to test with local hadoop
 *
 * @throws IOException 
 * @throws ClassNotFoundException 
 * @throws InterruptedException 
 */
@Test
public void testWordCount() throws IOException, InterruptedException, ClassNotFoundException {
    String inputPath = "/tmp/input-wordcount.txt";
    String outputPath = "/tmp/output-plume-wordcount";
    // Prepare input for test
    FileSystem system = FileSystem.getLocal(new Configuration());
    system.copyFromLocalFile(new Path(Resources.getResource("simple-text.txt").getPath()), new Path(inputPath));
    // Prepare output for test
    system.delete(new Path(outputPath), true);
    // Prepare workflow
    WordCountWorkflow workFlow = new WordCountWorkflow();
    // Execute it
    MapRedExecutor executor = new MapRedExecutor();
    executor.execute(workFlow, outputPath);

    List<String> str = Files.readLines(new File(outputPath + "/1_1/1-r-00000"), Charsets.UTF_8);

    Map<String, String> m = Maps.newHashMap();
    for (String line : str) {
        m.put(line.split("\t")[0], line.split("\t")[1]); // not super-optimal, but less code
    }
    assertEquals(3 + "", m.get("is"));
    assertEquals(3 + "", m.get("some"));
    assertEquals(3 + "", m.get("simple"));
    assertEquals(1 + "", m.get("examples"));
    assertEquals(2 + "", m.get("text"));
}

From source file: com.twitter.elephanttwin.lucene.indexing.AbstractLuceneIndexingReducer.java

License: Apache License

@Override
public void cleanup(Reducer<KIN, VIN, NullWritable, NullWritable>.Context context) throws IOException {
    // This may take a while...
    indexer.close();
    LOG.info("Done finalizing index!");

    LOG.info(cnt + " records added to the index");
    LOG.info(skipped + " records skipped");

    // Copy from local back to HDFS.
    Path destination = new Path(context.getConfiguration().get(HDFS_INDEX_LOCATION));
    LOG.info("final index destination: " + destination);
    LOG.info("copying from " + tmpIndex + " to " + destination);

    FileSystem fs = FileSystem.get(context.getConfiguration());

    if (!fs.exists(destination)) {
        fs.mkdirs(destination);
    }

    fs.copyFromLocalFile(new Path(tmpIndex.getAbsolutePath()), destination);
    LOG.info("copying complete!");

    // Clean up local tmp directory.
    FileUtil.fullyDelete(tmpIndex);
    LOG.info("local directory " + tmpIndex + " removed!");

    heartbeatThread.interrupt();
}

From source file: com.twitter.hraven.etl.JobFilePartitioner.java

License: Apache License

/**
 * @param hdfs
 *          FileSystem handle
 * @param f
 *          file to copy to HDFS
 * @param outputPath
 * @param skipExisting
 *          skip if the file already exists in the target. The file will be
 *          overwritten if it is already there and this argument is false.
 * @throws IOException
 *           if target directory cannot be created or file cannot be copied to
 *           target directory.
 */
private void processPlainFile(FileSystem hdfs, File f, Path outputPath, boolean skipExisting)
        throws IOException {
    long fileModTime = f.lastModified();
    Path targetDir = getTargetDirectory(hdfs, outputPath, fileModTime);

    boolean doCopy = true;
    Path sourceFile = new Path(f.getPath());
    if (skipExisting) {
        Path target = new Path(targetDir, sourceFile.getName());
        if (hdfs.exists(target)) {
            doCopy = false;
        }
    }
    if (doCopy) {
        hdfs.copyFromLocalFile(sourceFile, targetDir);
    }

}

From source file: com.twitter.hraven.etl.TestFileLister.java

License: Apache License

@Test
public void testPruneFileListBySize() throws IOException {

    long maxFileSize = 20L;
    FileStatus[] origList = new FileStatus[2];
    FileSystem hdfs = FileSystem.get(UTIL.getConfiguration());
    Path inputPath = new Path("/inputdir_filesize");
    boolean os = hdfs.mkdirs(inputPath);
    assertTrue(os);
    assertTrue(hdfs.exists(inputPath));

    final String JOB_HISTORY_FILE_NAME = "src/test/resources/job_1329348432655_0001-1329348443227-user-Sleep+job-1329348468601-10-1-SUCCEEDED-default.jhist";
    File jobHistoryfile = new File(JOB_HISTORY_FILE_NAME);
    Path srcPath = new Path(jobHistoryfile.toURI());
    hdfs.copyFromLocalFile(srcPath, inputPath);
    Path expPath = new Path(inputPath.toUri() + "/" + srcPath.getName());
    assertTrue(hdfs.exists(expPath));
    origList[0] = hdfs.getFileStatus(expPath);

    final String JOB_CONF_FILE_NAME = "src/test/resources/job_1329348432655_0001_conf.xml";
    File jobConfFile = new File(JOB_CONF_FILE_NAME);
    srcPath = new Path(jobConfFile.toURI());
    hdfs.copyFromLocalFile(srcPath, inputPath);
    expPath = new Path(inputPath.toUri() + "/" + srcPath.getName());
    assertTrue(hdfs.exists(expPath));
    origList[1] = hdfs.getFileStatus(expPath);

    FileStatus[] prunedList = FileLister.pruneFileListBySize(maxFileSize, origList, hdfs, inputPath);
    assertNotNull(prunedList);
    assertTrue(prunedList.length == 0);

    Path emptyFile = new Path(
            inputPath.toUri() + "/" + "job_1329341111111_0101-1329111113227-user2-Sleep.jhist");
    os = hdfs.createNewFile(emptyFile);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyFile));
    origList[0] = hdfs.getFileStatus(emptyFile);

    Path emptyConfFile = new Path(inputPath.toUri() + "/" + "job_1329341111111_0101_conf.xml");
    os = hdfs.createNewFile(emptyConfFile);

    assertTrue(os);
    assertTrue(hdfs.exists(emptyConfFile));
    origList[1] = hdfs.getFileStatus(emptyConfFile);

    prunedList = FileLister.pruneFileListBySize(maxFileSize, origList, hdfs, inputPath);
    assertNotNull(prunedList);
    assertTrue(prunedList.length == 2);

}

From source file: com.twitter.hraven.etl.TestFileLister.java

License: Apache License

/**
 * Removes a conf file which has already been put in prunedList.
 *
 * @throws IOException
 */
@Test
public void testPruneFileListRemovingConfFromPruneList() throws IOException {

    long maxFileSize = 20L;
    FileStatus[] origList = new FileStatus[2];
    FileSystem hdfs = FileSystem.get(UTIL.getConfiguration());
    Path inputPath = new Path("/inputdir_filesize_pruneList");
    boolean os = hdfs.mkdirs(inputPath);
    assertTrue(os);
    assertTrue(hdfs.exists(inputPath));

    Path relocationPath = new Path("/relocation_filesize_pruneList");
    os = hdfs.mkdirs(relocationPath);
    assertTrue(os);
    assertTrue(hdfs.exists(relocationPath));

    Path emptyConfFile = new Path(inputPath.toUri() + "/" + "job_1329348432655_0001_conf.xml");
    os = hdfs.createNewFile(emptyConfFile);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyConfFile));
    origList[0] = hdfs.getFileStatus(emptyConfFile);

    final String JOB_HISTORY_FILE_NAME = "src/test/resources/job_1329348432655_0001-1329348443227-user-Sleep+job-1329348468601-10-1-SUCCEEDED-default.jhist";
    File jobHistoryfile = new File(JOB_HISTORY_FILE_NAME);
    Path srcPath = new Path(jobHistoryfile.toURI());
    hdfs.copyFromLocalFile(srcPath, inputPath);
    Path expPath = new Path(inputPath.toUri() + "/" + srcPath.getName());
    assertTrue(hdfs.exists(expPath));
    origList[1] = hdfs.getFileStatus(expPath);

    FileStatus[] prunedList = FileLister.pruneFileListBySize(maxFileSize, origList, hdfs, inputPath);
    assertNotNull(prunedList);
    assertTrue(prunedList.length == 0);
}

From source file: com.twitter.hraven.etl.TestFileLister.java

License: Apache License

/**
 * Tests the case when several files are spread out in the dir and need to be removed.
 *
 * @throws IOException
 */
@Test
public void testPruneFileListMultipleFilesAlreadyMovedCases() throws IOException {

    long maxFileSize = 20L;
    FileStatus[] origList = new FileStatus[12];
    FileSystem hdfs = FileSystem.get(UTIL.getConfiguration());
    Path inputPath = new Path("/inputdir_filesize_multiple");
    boolean os = hdfs.mkdirs(inputPath);
    assertTrue(os);
    assertTrue(hdfs.exists(inputPath));

    Path relocationPath = new Path("/relocation_filesize_multiple");
    os = hdfs.mkdirs(relocationPath);
    assertTrue(os);
    assertTrue(hdfs.exists(relocationPath));

    Path emptyFile = new Path(
            inputPath.toUri() + "/" + "job_1329341111111_0101-1329111113227-user2-Sleep.jhist");
    os = hdfs.createNewFile(emptyFile);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyFile));
    origList[0] = hdfs.getFileStatus(emptyFile);

    Path emptyConfFile = new Path(inputPath.toUri() + "/" + "job_1329341111111_0101_conf.xml");
    os = hdfs.createNewFile(emptyConfFile);

    assertTrue(os);
    assertTrue(hdfs.exists(emptyConfFile));
    origList[1] = hdfs.getFileStatus(emptyConfFile);

    final String JOB_HISTORY_FILE_NAME = "src/test/resources/job_1329348432655_0001-1329348443227-user-Sleep+job-1329348468601-10-1-SUCCEEDED-default.jhist";
    File jobHistoryfile = new File(JOB_HISTORY_FILE_NAME);
    Path srcPath = new Path(jobHistoryfile.toURI());
    hdfs.copyFromLocalFile(srcPath, inputPath);
    Path expPath = new Path(inputPath.toUri() + "/" + srcPath.getName());
    assertTrue(hdfs.exists(expPath));
    origList[2] = hdfs.getFileStatus(expPath);

    final String JOB_CONF_FILE_NAME = "src/test/resources/job_1329348432655_0001_conf.xml";
    File jobConfFile = new File(JOB_CONF_FILE_NAME);
    srcPath = new Path(jobConfFile.toURI());
    hdfs.copyFromLocalFile(srcPath, inputPath);
    expPath = new Path(inputPath.toUri() + "/" + srcPath.getName());
    assertTrue(hdfs.exists(expPath));
    origList[3] = hdfs.getFileStatus(expPath);

    Path inputPath2 = new Path(inputPath.toUri() + "/"
            + "job_1311222222255_0221-1311111143227-user10101-WordCount-1-SUCCEEDED-default.jhist");
    hdfs.copyFromLocalFile(srcPath, inputPath2);
    assertTrue(hdfs.exists(inputPath2));
    origList[4] = hdfs.getFileStatus(inputPath2);

    Path inputPath3 = new Path(inputPath.toUri() + "/"
            + "job_1399999999155_0991-1311111143227-user3321-TeraGen-1-SUCCEEDED-default.jhist");
    hdfs.copyFromLocalFile(srcPath, inputPath3);
    assertTrue(hdfs.exists(inputPath3));
    origList[5] = hdfs.getFileStatus(inputPath3);

    Path inputPath4 = new Path(inputPath.toUri() + "/"
            + "job_1399977777177_0771-1311111143227-user3321-TeraSort-1-SUCCEEDED-default.jhist");
    hdfs.copyFromLocalFile(srcPath, inputPath4);
    assertTrue(hdfs.exists(inputPath4));
    origList[6] = hdfs.getFileStatus(inputPath4);

    Path emptyFile2 = new Path(
            inputPath.toUri() + "/" + "job_1329343333333_5551-1329111113227-user2-SomethingElse.jhist");
    os = hdfs.createNewFile(emptyFile2);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyFile2));
    origList[7] = hdfs.getFileStatus(emptyFile2);

    Path emptyConfFile2 = new Path(inputPath.toUri() + "/" + "job_1329343333333_5551_conf.xml");
    os = hdfs.createNewFile(emptyConfFile2);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyConfFile2));
    origList[8] = hdfs.getFileStatus(emptyConfFile2);

    // this is an empty file which tests the toBeRemovedFileList
    // at the end of function pruneFileListBySize
    Path emptyConfFile3 = new Path(inputPath.toUri() + "/" + "job_1399999999155_0991_conf.xml");
    os = hdfs.createNewFile(emptyConfFile3);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyConfFile3));
    origList[9] = hdfs.getFileStatus(emptyConfFile3);

    Path inputConfPath2 = new Path(inputPath.toUri() + "/" + "job_1311222222255_0221_conf.xml");
    srcPath = new Path(jobConfFile.toURI());
    hdfs.copyFromLocalFile(srcPath, inputConfPath2);
    assertTrue(hdfs.exists(inputConfPath2));
    origList[10] = hdfs.getFileStatus(inputConfPath2);

    // this is an empty file which tests the toBeRemovedFileList
    // at the end of function pruneFileListBySize
    Path emptyConfFile4 = new Path(inputPath.toUri() + "/" + "job_1399977777177_0771_conf.xml");
    os = hdfs.createNewFile(emptyConfFile4);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyConfFile4));
    origList[11] = hdfs.getFileStatus(emptyConfFile4);

    FileStatus[] prunedList = FileLister.pruneFileListBySize(maxFileSize, origList, hdfs, inputPath);
    assertNotNull(prunedList);
    assertTrue(prunedList.length == 4);
}

From source file: com.twitter.pig.backend.hadoop.executionengine.tez.TezJobControlCompiler.java

License: Apache License

public DAG createDAG(TezOperPlan tezPlan, FileSystem remoteFs, TezConfiguration conf, ApplicationId appId,
        Path remoteStagingDir) throws IOException, YarnException {

    DAG dag = new DAG("MRRSleepJob");
    /*
          String jarPath = ClassUtil.findContainingJar(getClass());
          Path remoteJarPath = remoteFs.makeQualified(
    new Path(remoteStagingDir, "dag_job.jar"));
          remoteFs.copyFromLocalFile(new Path(jarPath), remoteJarPath);
          FileStatus jarFileStatus = remoteFs.getFileStatus(remoteJarPath);
    */
    Map<String, LocalResource> commonLocalResources = new HashMap<String, LocalResource>();

    if (!pigContext.inIllustrator && pigContext.getExecType() != ExecType.TEZ_LOCAL) {

        // Setup the DistributedCache for this job
        for (URL extraJar : pigContext.extraJars) {
            //log.debug("Adding jar to DistributedCache: " + extraJar.toString());
            TezJobControlCompiler.putJarOnClassPathThroughDistributedCache(pigContext, conf, extraJar);
        }

        //Create the jar of all functions and classes required
        File submitJarFile = File.createTempFile("Job", ".jar");
        //log.info("creating jar file "+submitJarFile.getName());
        // ensure the job jar is deleted on exit
        submitJarFile.deleteOnExit();
        FileOutputStream fos = new FileOutputStream(submitJarFile);
        try {
            JarManager.createJar(fos, new HashSet<String>(), pigContext);
        } catch (ClassNotFoundException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }

        Path remoteJarPath = remoteFs.makeQualified(new Path(remoteStagingDir, "dag_job.jar"));
        remoteFs.copyFromLocalFile(new Path(submitJarFile.getAbsolutePath()), remoteJarPath);
        FileStatus jarFileStatus = remoteFs.getFileStatus(remoteJarPath);

        LocalResource dagJarLocalRsrc = LocalResource.newInstance(
                ConverterUtils.getYarnUrlFromPath(remoteJarPath), LocalResourceType.FILE,
                LocalResourceVisibility.APPLICATION, jarFileStatus.getLen(),
                jarFileStatus.getModificationTime());
        commonLocalResources.put("dag_job.jar", dagJarLocalRsrc);

        Path remoteTezJarPath = remoteFs.makeQualified(new Path(remoteStagingDir, "pig-tez.jar"));
        remoteFs.copyFromLocalFile(new Path("pig-tez.jar"), remoteTezJarPath);
        FileStatus tezJarFileStatus = remoteFs.getFileStatus(remoteTezJarPath);

        LocalResource tezJarLocalRsrc = LocalResource.newInstance(
                ConverterUtils.getYarnUrlFromPath(remoteTezJarPath), LocalResourceType.FILE,
                LocalResourceVisibility.APPLICATION, tezJarFileStatus.getLen(),
                tezJarFileStatus.getModificationTime());
        commonLocalResources.put("pig-tez.jar", tezJarLocalRsrc);

        //log.info("jar file "+submitJarFile.getName()+" created");
        //Start setting the JobConf properties
        conf.set("mapred.jar", submitJarFile.getPath());
    }

    /*
    LocalResource dagJarLocalRsrc = LocalResource.newInstance(
    ConverterUtils.getYarnUrlFromPath(remoteJarPath),
    LocalResourceType.FILE,
    LocalResourceVisibility.APPLICATION,
    jarFileStatus.getLen(),
    jarFileStatus.getModificationTime());
    commonLocalResources.put("dag_job.jar", dagJarLocalRsrc);
    */

    Hashtable<TezOperator, Pair<Vertex, Configuration>> vertexMap = new Hashtable<TezOperator, Pair<Vertex, Configuration>>();

    List<TezOperator> operators = tezPlan.getRoots();

    // add settings for pig statistics
    String setScriptProp = conf.get(ScriptState.INSERT_ENABLED, "true");
    ScriptState ss = null;

    if (setScriptProp.equalsIgnoreCase("true")) {
        ss = ScriptState.get();
    }

    while (operators != null && operators.size() != 0) {

        List<TezOperator> successors = new ArrayList<TezOperator>();

        for (TezOperator oper : operators) {

            Configuration operConf = oper.configure(pigContext, conf);
            /*
            if (ss != null){
               ss.addSettingsToConf(oper, conf);
            }
            */
            List<TezOperator> predecessors = plan.getPredecessors(oper);

            if (predecessors != null && predecessors.size() != 0) {
                MultiStageMRConfToTezTranslator.translateVertexConfToTez(operConf,
                        vertexMap.get(predecessors.get(0)).second);
            } else {
                MultiStageMRConfToTezTranslator.translateVertexConfToTez(operConf, null);
            }

            List<TezOperator> operSuccessors = tezPlan.getSuccessors(oper);
            if (operSuccessors != null) {
                successors.addAll(operSuccessors);
            }

            MRHelpers.doJobClientMagic(operConf);

            //mapStageConf.setInt(MRJobConfig.NUM_MAPS, numMapper);

            Vertex operVertex = new Vertex(oper.name(),
                    new ProcessorDescriptor(oper.getProcessor(), MRHelpers.createUserPayloadFromConf(operConf)),
                    oper.getParallelism(), MRHelpers.getMapResource(operConf));

            oper.configureVertex(operVertex, operConf, commonLocalResources, remoteStagingDir);

            dag.addVertex(operVertex);
            if (predecessors != null) {

                for (TezOperator predecessor : predecessors) {
                    dag.addEdge(new Edge(vertexMap.get(predecessor).first, operVertex,
                            tezPlan.getEdgeProperty(predecessor, oper)));
                }

            }

            vertexMap.put(oper, new Pair<Vertex, Configuration>(operVertex, operConf));
        }

        operators = successors;
    }
    return dag;
}

From source file: com.twitter.pig.backend.hadoop.executionengine.tez.TezJobControlCompiler.java

License: Apache License

public static void setupDistributedCache(PigContext pigContext, Configuration conf, String[] paths,
        boolean shipToCluster) throws IOException {
    // Turn on the symlink feature
    DistributedCache.createSymlink(conf);

    for (String path : paths) {
        path = path.trim();
        if (path.length() != 0) {
            Path src = new Path(path);

            // Ensure that 'src' is a valid URI
            URI srcURI = toURI(src);

            // Ship it to the cluster if necessary and add to the
            // DistributedCache
            if (shipToCluster) {
                Path dst = new Path(FileLocalizer.getTemporaryPath(pigContext).toString());
                FileSystem fs = dst.getFileSystem(conf);
                fs.copyFromLocalFile(src, dst);

                // Construct the dst#srcName uri for DistributedCache
                URI dstURI = null;
                try {
                    dstURI = new URI(dst.toString() + "#" + src.getName());
                } catch (URISyntaxException ue) {
                    byte errSrc = pigContext.getErrorSource();
                    int errCode = 0;
                    switch (errSrc) {
                    case PigException.REMOTE_ENVIRONMENT:
                        errCode = 6004;
                        break;
                    case PigException.USER_ENVIRONMENT:
                        errCode = 4004;
                        break;
                    default:
                        errCode = 2037;
                        break;
                    }
                    String msg = "Invalid ship specification. " + "File doesn't exist: " + dst;
                    throw new ExecException(msg, errCode, errSrc);
                }
                DistributedCache.addCacheFile(dstURI, conf);
            } else {
                DistributedCache.addCacheFile(srcURI, conf);
            }
        }
    }
}