List of usage examples for org.apache.hadoop.fs FileSystem copyFromLocalFile
public void copyFromLocalFile(Path src, Path dst) throws IOException
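Before the harvested examples, here is a minimal, self-contained sketch of the two-argument overload. It copies a file from the local filesystem into whatever filesystem the configuration resolves to, leaving the source intact; the class name and paths are placeholders, not taken from any example below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class CopyFromLocalExample {
    public static void main(String[] args) throws Exception {
        // Resolves to HDFS on a configured cluster, or to the local
        // filesystem when no fs.defaultFS is set.
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Copy a local file to the destination; the source file is kept.
        fs.copyFromLocalFile(new Path("/tmp/example-input.txt"),
                new Path("/data/example-input.txt"));

        // Overloads also let you delete the source and control overwriting:
        //   copyFromLocalFile(boolean delSrc, Path src, Path dst)
        //   copyFromLocalFile(boolean delSrc, boolean overwrite, Path src, Path dst)
        fs.close();
    }
}

As several of the examples below rely on, dst may also be an existing directory, in which case the source file keeps its own name under that directory.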
From source file:com.tdunning.plume.local.lazy.MapRedSingleFlattenChannelTest.java
License:Apache License
@Test public void test() throws Exception { String outputPath = "/tmp/output-plume-singleflattenchanneltest"; String inputPath = "/tmp/input-wordcount.txt"; String inputPath2 = "/tmp/input-moretext.txt"; // Prepare input for test FileSystem system = FileSystem.getLocal(new Configuration()); system.copyFromLocalFile(new Path(Resources.getResource("simple-text.txt").getPath()), new Path(inputPath)); system.copyFromLocalFile(new Path(Resources.getResource("simple-text.txt").getPath()), new Path(inputPath2)); // Prepare output for test system.delete(new Path(outputPath), true); // Prepare workflow MapRedSingleFlattenChannelTestWorkflow workFlow = new MapRedSingleFlattenChannelTestWorkflow(); // Execute it MapRedExecutor executor = new MapRedExecutor(); executor.execute(workFlow, outputPath); /**//from www .ja v a 2s . c o m * TODO add test validation */ }
From source file:com.tdunning.plume.local.lazy.MapRedTwoSequentialGBKTest.java
License:Apache License
@Test public void test() throws Exception { String outputPath = "/tmp/output-plume-twosequentialgbktest"; String inputPath = "/tmp/input-wordcount.txt"; // Prepare input for test FileSystem system = FileSystem.getLocal(new Configuration()); system.copyFromLocalFile(new Path(Resources.getResource("simple-text.txt").getPath()), new Path(inputPath)); // Prepare output for test system.delete(new Path(outputPath), true); // Prepare workflow TwoSequentialGBKWorkflow workFlow = new TwoSequentialGBKWorkflow(); // Execute it MapRedExecutor executor = new MapRedExecutor(); executor.execute(workFlow, outputPath); String outputId = ((LazyCollection<?>) workFlow.getOutputs().get(0)).getPlumeId(); List<String> str = Files.readLines(new File(outputPath + "/" + outputId + "/1-r-00000"), Charsets.UTF_8); Map<String, String> m = Maps.newHashMap(); for (String line : str) { m.put(line.split("\t")[0], line.split("\t")[1]); // not super-optimal, but less code }// ww w .j av a 2 s.c o m assertEquals("bar 2", m.get("To test text processing with some simple")); assertEquals("bar 2", m.get("examples is some simple")); assertEquals("bar 2", m.get("is is")); assertEquals("bar 2", m.get("some simple text")); }
From source file:com.tdunning.plume.local.lazy.MapRedWordCountTest.java
License:Apache License
/**
 * The wordcount example to test with local hadoop
 *
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
@Test
public void testWordCount() throws IOException, InterruptedException, ClassNotFoundException {
    String inputPath = "/tmp/input-wordcount.txt";
    String outputPath = "/tmp/output-plume-wordcount";

    // Prepare input for test
    FileSystem system = FileSystem.getLocal(new Configuration());
    system.copyFromLocalFile(new Path(Resources.getResource("simple-text.txt").getPath()),
            new Path(inputPath));

    // Prepare output for test
    system.delete(new Path(outputPath), true);

    // Prepare workflow
    WordCountWorkflow workFlow = new WordCountWorkflow();

    // Execute it
    MapRedExecutor executor = new MapRedExecutor();
    executor.execute(workFlow, outputPath);

    List<String> str = Files.readLines(new File(outputPath + "/1_1/1-r-00000"), Charsets.UTF_8);
    Map<String, String> m = Maps.newHashMap();
    for (String line : str) {
        m.put(line.split("\t")[0], line.split("\t")[1]); // not super-optimal, but less code
    }
    assertEquals(3 + "", m.get("is"));
    assertEquals(3 + "", m.get("some"));
    assertEquals(3 + "", m.get("simple"));
    assertEquals(1 + "", m.get("examples"));
    assertEquals(2 + "", m.get("text"));
}
From source file:com.twitter.elephanttwin.lucene.indexing.AbstractLuceneIndexingReducer.java
License:Apache License
@Override
public void cleanup(Reducer<KIN, VIN, NullWritable, NullWritable>.Context context) throws IOException {
    // This may take a while...
    indexer.close();
    LOG.info("Done finalizing index!");
    LOG.info(cnt + " records added to the index");
    LOG.info(skipped + " records skipped");

    // Copy from local back to HDFS.
    Path destination = new Path(context.getConfiguration().get(HDFS_INDEX_LOCATION));
    LOG.info("final index destination: " + destination);
    LOG.info("copying from " + tmpIndex + " to " + destination);
    FileSystem fs = FileSystem.get(context.getConfiguration());
    if (!fs.exists(destination)) {
        fs.mkdirs(destination);
    }
    fs.copyFromLocalFile(new Path(tmpIndex.getAbsolutePath()), destination);
    LOG.info("copying complete!");

    // Clean up local tmp directory.
    FileUtil.fullyDelete(tmpIndex);
    LOG.info("local directory " + tmpIndex + " removed!");
    heartbeatThread.interrupt();
}
From source file:com.twitter.hraven.etl.JobFilePartitioner.java
License:Apache License
/**
 * @param hdfs
 *          FileSystem handle
 * @param f
 *          file to copy to HDFS
 * @param outputPath
 * @param skipExisting
 *          skip if the file already exists in the target. The file will be
 *          overwritten if already there and this argument is false.
 * @throws IOException
 *           if the target directory cannot be created or the file cannot be
 *           copied to the target directory.
 */
private void processPlainFile(FileSystem hdfs, File f, Path outputPath, boolean skipExisting)
        throws IOException {
    long fileModTime = f.lastModified();
    Path targetDir = getTargetDirectory(hdfs, outputPath, fileModTime);

    boolean doCopy = true;
    Path sourceFile = new Path(f.getPath());
    if (skipExisting) {
        Path target = new Path(targetDir, sourceFile.getName());
        if (hdfs.exists(target)) {
            doCopy = false;
        }
    }
    if (doCopy) {
        hdfs.copyFromLocalFile(sourceFile, targetDir);
    }
}
From source file:com.twitter.hraven.etl.TestFileLister.java
License:Apache License
@Test
public void testPruneFileListBySize() throws IOException {
    long maxFileSize = 20L;
    FileStatus[] origList = new FileStatus[2];
    FileSystem hdfs = FileSystem.get(UTIL.getConfiguration());
    Path inputPath = new Path("/inputdir_filesize");
    boolean os = hdfs.mkdirs(inputPath);
    assertTrue(os);
    assertTrue(hdfs.exists(inputPath));

    final String JOB_HISTORY_FILE_NAME =
        "src/test/resources/job_1329348432655_0001-1329348443227-user-Sleep+job-1329348468601-10-1-SUCCEEDED-default.jhist";
    File jobHistoryfile = new File(JOB_HISTORY_FILE_NAME);
    Path srcPath = new Path(jobHistoryfile.toURI());
    hdfs.copyFromLocalFile(srcPath, inputPath);
    Path expPath = new Path(inputPath.toUri() + "/" + srcPath.getName());
    assertTrue(hdfs.exists(expPath));
    origList[0] = hdfs.getFileStatus(expPath);

    final String JOB_CONF_FILE_NAME = "src/test/resources/job_1329348432655_0001_conf.xml";
    File jobConfFile = new File(JOB_CONF_FILE_NAME);
    srcPath = new Path(jobConfFile.toURI());
    hdfs.copyFromLocalFile(srcPath, inputPath);
    expPath = new Path(inputPath.toUri() + "/" + srcPath.getName());
    assertTrue(hdfs.exists(expPath));
    origList[1] = hdfs.getFileStatus(expPath);

    FileStatus[] prunedList = FileLister.pruneFileListBySize(maxFileSize, origList, hdfs, inputPath);
    assertNotNull(prunedList);
    assertTrue(prunedList.length == 0);

    Path emptyFile = new Path(
        inputPath.toUri() + "/" + "job_1329341111111_0101-1329111113227-user2-Sleep.jhist");
    os = hdfs.createNewFile(emptyFile);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyFile));
    origList[0] = hdfs.getFileStatus(emptyFile);

    Path emptyConfFile = new Path(inputPath.toUri() + "/" + "job_1329341111111_0101_conf.xml");
    os = hdfs.createNewFile(emptyConfFile);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyConfFile));
    origList[1] = hdfs.getFileStatus(emptyConfFile);

    prunedList = FileLister.pruneFileListBySize(maxFileSize, origList, hdfs, inputPath);
    assertNotNull(prunedList);
    assertTrue(prunedList.length == 2);
}
From source file:com.twitter.hraven.etl.TestFileLister.java
License:Apache License
/**
 * removes conf file which has already been put in prunedList
 *
 * @throws IOException
 */
@Test
public void testPruneFileListRemovingConfFromPruneList() throws IOException {
    long maxFileSize = 20L;
    FileStatus[] origList = new FileStatus[2];
    FileSystem hdfs = FileSystem.get(UTIL.getConfiguration());
    Path inputPath = new Path("/inputdir_filesize_pruneList");
    boolean os = hdfs.mkdirs(inputPath);
    assertTrue(os);
    assertTrue(hdfs.exists(inputPath));

    Path relocationPath = new Path("/relocation_filesize_pruneList");
    os = hdfs.mkdirs(relocationPath);
    assertTrue(os);
    assertTrue(hdfs.exists(relocationPath));

    Path emptyConfFile = new Path(inputPath.toUri() + "/" + "job_1329348432655_0001_conf.xml");
    os = hdfs.createNewFile(emptyConfFile);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyConfFile));
    origList[0] = hdfs.getFileStatus(emptyConfFile);

    final String JOB_HISTORY_FILE_NAME =
        "src/test/resources/job_1329348432655_0001-1329348443227-user-Sleep+job-1329348468601-10-1-SUCCEEDED-default.jhist";
    File jobHistoryfile = new File(JOB_HISTORY_FILE_NAME);
    Path srcPath = new Path(jobHistoryfile.toURI());
    hdfs.copyFromLocalFile(srcPath, inputPath);
    Path expPath = new Path(inputPath.toUri() + "/" + srcPath.getName());
    assertTrue(hdfs.exists(expPath));
    origList[1] = hdfs.getFileStatus(expPath);

    FileStatus[] prunedList = FileLister.pruneFileListBySize(maxFileSize, origList, hdfs, inputPath);
    assertNotNull(prunedList);
    assertTrue(prunedList.length == 0);
}
From source file:com.twitter.hraven.etl.TestFileLister.java
License:Apache License
/**
 * tests the case when several files are spread out in the dir and need to be removed
 *
 * @throws IOException
 */
@Test
public void testPruneFileListMultipleFilesAlreadyMovedCases() throws IOException {
    long maxFileSize = 20L;
    FileStatus[] origList = new FileStatus[12];
    FileSystem hdfs = FileSystem.get(UTIL.getConfiguration());
    Path inputPath = new Path("/inputdir_filesize_multiple");
    boolean os = hdfs.mkdirs(inputPath);
    assertTrue(os);
    assertTrue(hdfs.exists(inputPath));

    Path relocationPath = new Path("/relocation_filesize_multiple");
    os = hdfs.mkdirs(relocationPath);
    assertTrue(os);
    assertTrue(hdfs.exists(relocationPath));

    Path emptyFile = new Path(
        inputPath.toUri() + "/" + "job_1329341111111_0101-1329111113227-user2-Sleep.jhist");
    os = hdfs.createNewFile(emptyFile);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyFile));
    origList[0] = hdfs.getFileStatus(emptyFile);

    Path emptyConfFile = new Path(inputPath.toUri() + "/" + "job_1329341111111_0101_conf.xml");
    os = hdfs.createNewFile(emptyConfFile);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyConfFile));
    origList[1] = hdfs.getFileStatus(emptyConfFile);

    final String JOB_HISTORY_FILE_NAME =
        "src/test/resources/job_1329348432655_0001-1329348443227-user-Sleep+job-1329348468601-10-1-SUCCEEDED-default.jhist";
    File jobHistoryfile = new File(JOB_HISTORY_FILE_NAME);
    Path srcPath = new Path(jobHistoryfile.toURI());
    hdfs.copyFromLocalFile(srcPath, inputPath);
    Path expPath = new Path(inputPath.toUri() + "/" + srcPath.getName());
    assertTrue(hdfs.exists(expPath));
    origList[2] = hdfs.getFileStatus(expPath);

    final String JOB_CONF_FILE_NAME = "src/test/resources/job_1329348432655_0001_conf.xml";
    File jobConfFile = new File(JOB_CONF_FILE_NAME);
    srcPath = new Path(jobConfFile.toURI());
    hdfs.copyFromLocalFile(srcPath, inputPath);
    expPath = new Path(inputPath.toUri() + "/" + srcPath.getName());
    assertTrue(hdfs.exists(expPath));
    origList[3] = hdfs.getFileStatus(expPath);

    Path inputPath2 = new Path(inputPath.toUri() + "/"
        + "job_1311222222255_0221-1311111143227-user10101-WordCount-1-SUCCEEDED-default.jhist");
    hdfs.copyFromLocalFile(srcPath, inputPath2);
    assertTrue(hdfs.exists(inputPath2));
    origList[4] = hdfs.getFileStatus(inputPath2);

    Path inputPath3 = new Path(inputPath.toUri() + "/"
        + "job_1399999999155_0991-1311111143227-user3321-TeraGen-1-SUCCEEDED-default.jhist");
    hdfs.copyFromLocalFile(srcPath, inputPath3);
    assertTrue(hdfs.exists(inputPath3));
    origList[5] = hdfs.getFileStatus(inputPath3);

    Path inputPath4 = new Path(inputPath.toUri() + "/"
        + "job_1399977777177_0771-1311111143227-user3321-TeraSort-1-SUCCEEDED-default.jhist");
    hdfs.copyFromLocalFile(srcPath, inputPath4);
    assertTrue(hdfs.exists(inputPath4));
    origList[6] = hdfs.getFileStatus(inputPath4);

    Path emptyFile2 = new Path(
        inputPath.toUri() + "/" + "job_1329343333333_5551-1329111113227-user2-SomethingElse.jhist");
    os = hdfs.createNewFile(emptyFile2);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyFile2));
    origList[7] = hdfs.getFileStatus(emptyFile2);

    Path emptyConfFile2 = new Path(inputPath.toUri() + "/" + "job_1329343333333_5551_conf.xml");
    os = hdfs.createNewFile(emptyConfFile2);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyConfFile2));
    origList[8] = hdfs.getFileStatus(emptyConfFile2);

    // this is an empty file which tests the toBeRemovedFileList
    // at the end of function pruneFileListBySize
    Path emptyConfFile3 = new Path(inputPath.toUri() + "/" + "job_1399999999155_0991_conf.xml");
    os = hdfs.createNewFile(emptyConfFile3);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyConfFile3));
    origList[9] = hdfs.getFileStatus(emptyConfFile3);

    Path inputConfPath2 = new Path(inputPath.toUri() + "/" + "job_1311222222255_0221_conf.xml");
    srcPath = new Path(jobConfFile.toURI());
    hdfs.copyFromLocalFile(srcPath, inputConfPath2);
    assertTrue(hdfs.exists(inputConfPath2));
    origList[10] = hdfs.getFileStatus(inputConfPath2);

    // this is an empty file which tests the toBeRemovedFileList
    // at the end of function pruneFileListBySize
    Path emptyConfFile4 = new Path(inputPath.toUri() + "/" + "job_1399977777177_0771_conf.xml");
    os = hdfs.createNewFile(emptyConfFile4);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyConfFile4));
    origList[11] = hdfs.getFileStatus(emptyConfFile4);

    FileStatus[] prunedList = FileLister.pruneFileListBySize(maxFileSize, origList, hdfs, inputPath);
    assertNotNull(prunedList);
    assertTrue(prunedList.length == 4);
}
From source file:com.twitter.pig.backend.hadoop.executionengine.tez.TezJobControlCompiler.java
License:Apache License
public DAG createDAG(TezOperPlan tezPlan, FileSystem remoteFs, TezConfiguration conf, ApplicationId appId,
        Path remoteStagingDir) throws IOException, YarnException {
    DAG dag = new DAG("MRRSleepJob");
    /*
    String jarPath = ClassUtil.findContainingJar(getClass());
    Path remoteJarPath = remoteFs.makeQualified(
        new Path(remoteStagingDir, "dag_job.jar"));
    remoteFs.copyFromLocalFile(new Path(jarPath), remoteJarPath);
    FileStatus jarFileStatus = remoteFs.getFileStatus(remoteJarPath);
    */
    Map<String, LocalResource> commonLocalResources = new HashMap<String, LocalResource>();

    if (!pigContext.inIllustrator && pigContext.getExecType() != ExecType.TEZ_LOCAL) {
        // Setup the DistributedCache for this job
        for (URL extraJar : pigContext.extraJars) {
            //log.debug("Adding jar to DistributedCache: " + extraJar.toString());
            TezJobControlCompiler.putJarOnClassPathThroughDistributedCache(pigContext, conf, extraJar);
        }

        //Create the jar of all functions and classes required
        File submitJarFile = File.createTempFile("Job", ".jar");
        //log.info("creating jar file "+submitJarFile.getName());
        // ensure the job jar is deleted on exit
        submitJarFile.deleteOnExit();
        FileOutputStream fos = new FileOutputStream(submitJarFile);
        try {
            JarManager.createJar(fos, new HashSet<String>(), pigContext);
        } catch (ClassNotFoundException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }

        Path remoteJarPath = remoteFs.makeQualified(new Path(remoteStagingDir, "dag_job.jar"));
        remoteFs.copyFromLocalFile(new Path(submitJarFile.getAbsolutePath()), remoteJarPath);
        FileStatus jarFileStatus = remoteFs.getFileStatus(remoteJarPath);
        LocalResource dagJarLocalRsrc = LocalResource.newInstance(
                ConverterUtils.getYarnUrlFromPath(remoteJarPath), LocalResourceType.FILE,
                LocalResourceVisibility.APPLICATION, jarFileStatus.getLen(),
                jarFileStatus.getModificationTime());
        commonLocalResources.put("dag_job.jar", dagJarLocalRsrc);

        Path remoteTezJarPath = remoteFs.makeQualified(new Path(remoteStagingDir, "pig-tez.jar"));
        remoteFs.copyFromLocalFile(new Path("pig-tez.jar"), remoteTezJarPath);
        FileStatus tezJarFileStatus = remoteFs.getFileStatus(remoteTezJarPath);
        LocalResource tezJarLocalRsrc = LocalResource.newInstance(
                ConverterUtils.getYarnUrlFromPath(remoteTezJarPath), LocalResourceType.FILE,
                LocalResourceVisibility.APPLICATION, tezJarFileStatus.getLen(),
                tezJarFileStatus.getModificationTime());
        commonLocalResources.put("pig-tez.jar", tezJarLocalRsrc);

        //log.info("jar file "+submitJarFile.getName()+" created");
        //Start setting the JobConf properties
        conf.set("mapred.jar", submitJarFile.getPath());
    }

    /*
    LocalResource dagJarLocalRsrc = LocalResource.newInstance(
        ConverterUtils.getYarnUrlFromPath(remoteJarPath),
        LocalResourceType.FILE,
        LocalResourceVisibility.APPLICATION,
        jarFileStatus.getLen(),
        jarFileStatus.getModificationTime());
    commonLocalResources.put("dag_job.jar", dagJarLocalRsrc);
    */

    Hashtable<TezOperator, Pair<Vertex, Configuration>> vertexMap =
            new Hashtable<TezOperator, Pair<Vertex, Configuration>>();
    List<TezOperator> operators = tezPlan.getRoots();

    // add settings for pig statistics
    String setScriptProp = conf.get(ScriptState.INSERT_ENABLED, "true");
    ScriptState ss = null;
    if (setScriptProp.equalsIgnoreCase("true")) {
        ss = ScriptState.get();
    }

    while (operators != null && operators.size() != 0) {
        List<TezOperator> successors = new ArrayList<TezOperator>();

        for (TezOperator oper : operators) {
            Configuration operConf = oper.configure(pigContext, conf);
            /*
            if (ss != null){
                ss.addSettingsToConf(oper, conf);
            }
            */
            List<TezOperator> predecessors = plan.getPredecessors(oper);
            if (predecessors != null && predecessors.size() != 0) {
                MultiStageMRConfToTezTranslator.translateVertexConfToTez(operConf,
                        vertexMap.get(predecessors.get(0)).second);
            } else {
                MultiStageMRConfToTezTranslator.translateVertexConfToTez(operConf, null);
            }

            List<TezOperator> operSuccessors = tezPlan.getSuccessors(oper);
            if (operSuccessors != null) {
                successors.addAll(operSuccessors);
            }

            MRHelpers.doJobClientMagic(operConf);
            //mapStageConf.setInt(MRJobConfig.NUM_MAPS, numMapper);

            Vertex operVertex = new Vertex(oper.name(),
                    new ProcessorDescriptor(oper.getProcessor(),
                            MRHelpers.createUserPayloadFromConf(operConf)),
                    oper.getParallelism(), MRHelpers.getMapResource(operConf));

            oper.configureVertex(operVertex, operConf, commonLocalResources, remoteStagingDir);

            dag.addVertex(operVertex);
            if (predecessors != null) {
                for (TezOperator predecessor : predecessors) {
                    dag.addEdge(new Edge(vertexMap.get(predecessor).first, operVertex,
                            tezPlan.getEdgeProperty(predecessor, oper)));
                }
            }

            vertexMap.put(oper, new Pair<Vertex, Configuration>(operVertex, operConf));
        }
        operators = successors;
    }
    return dag;
}
From source file:com.twitter.pig.backend.hadoop.executionengine.tez.TezJobControlCompiler.java
License:Apache License
public static void setupDistributedCache(PigContext pigContext, Configuration conf, String[] paths,
        boolean shipToCluster) throws IOException {
    // Turn on the symlink feature
    DistributedCache.createSymlink(conf);

    for (String path : paths) {
        path = path.trim();
        if (path.length() != 0) {
            Path src = new Path(path);

            // Ensure that 'src' is a valid URI
            URI srcURI = toURI(src);

            // Ship it to the cluster if necessary and add to the
            // DistributedCache
            if (shipToCluster) {
                Path dst = new Path(FileLocalizer.getTemporaryPath(pigContext).toString());
                FileSystem fs = dst.getFileSystem(conf);
                fs.copyFromLocalFile(src, dst);

                // Construct the dst#srcName uri for DistributedCache
                URI dstURI = null;
                try {
                    dstURI = new URI(dst.toString() + "#" + src.getName());
                } catch (URISyntaxException ue) {
                    byte errSrc = pigContext.getErrorSource();
                    int errCode = 0;
                    switch (errSrc) {
                    case PigException.REMOTE_ENVIRONMENT:
                        errCode = 6004;
                        break;
                    case PigException.USER_ENVIRONMENT:
                        errCode = 4004;
                        break;
                    default:
                        errCode = 2037;
                        break;
                    }
                    String msg = "Invalid ship specification. " + "File doesn't exist: " + dst;
                    throw new ExecException(msg, errCode, errSrc);
                }
                DistributedCache.addCacheFile(dstURI, conf);
            } else {
                DistributedCache.addCacheFile(srcURI, conf);
            }
        }
    }
}