List of usage examples for org.apache.hadoop.fs.Path toUri()
public URI toUri()
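Path.toUri() returns the java.net.URI underlying the Path. The examples below mostly use it either to split a location into scheme, authority, and path components, or to hand a fully qualified location to an API that wants a URI. Before the project examples, a minimal standalone sketch of what the returned URI exposes (hypothetical paths; assumes only hadoop-common on the classpath):

import java.net.URI;
import org.apache.hadoop.fs.Path;

public class PathToUriDemo {
    public static void main(String[] args) {
        // A fully qualified path carries its scheme and authority into the URI.
        URI uri = new Path("hdfs://namenode:8020/user/alice/data.txt").toUri();
        System.out.println(uri.getScheme());    // hdfs
        System.out.println(uri.getAuthority()); // namenode:8020
        System.out.println(uri.getPath());      // /user/alice/data.txt

        // A scheme-less path yields a URI with null scheme and authority.
        System.out.println(new Path("/tmp/work").toUri().getScheme()); // null
    }
}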
From source file: com.splunk.shuttl.prototype.symlink.BucketBlockSymlinkPrototypeTest.java
License: Apache License
private void createSymlinkToPathInDir(Path path, File dir) throws IOException {
    File fileInDir = new File(dir, path.getName());
    DistributedFileSystem dfs = (DistributedFileSystem) hadoopFileSystem;
    ClientProtocol namenode = dfs.getClient().namenode;
    String pathOnHadoop = path.toUri().getPath();
    LocatedBlocks blockLocations = namenode.getBlockLocations(pathOnHadoop, 0, Long.MAX_VALUE);
    List<LocatedBlock> locatedBlocks = blockLocations.getLocatedBlocks();
    if (!locatedBlocks.isEmpty()) {
        doSymlinkPathInDir(fileInDir, blockLocations, locatedBlocks);
    } else {
        // Files without any blocks are empty files. Just create them.
        assertTrue(fileInDir.createNewFile());
    }
}
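The key move here is path.toUri().getPath(): NameNode RPCs such as getBlockLocations take a plain path string with no scheme or authority. A minimal sketch of that stripping, using a hypothetical fully qualified path:

import org.apache.hadoop.fs.Path;

// Hypothetical qualified path; toUri().getPath() drops the scheme and
// authority, leaving the raw path string the NameNode protocol expects.
Path qualified = new Path("hdfs://namenode:8020/buckets/db/bucket1/rawdata");
String pathOnHadoop = qualified.toUri().getPath(); // "/buckets/db/bucket1/rawdata"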
From source file: com.srini.hadoopYarn.Client.java
License: Apache License
/**
 * Main run function for the client.
 * @return true if application completed successfully
 * @throws IOException
 * @throws YarnException
 */
public boolean run() throws IOException, YarnException {

    LOG.info("Running Client");
    yarnClient.start();

    YarnClusterMetrics clusterMetrics = yarnClient.getYarnClusterMetrics();
    LOG.info("Got Cluster metric info from ASM" + ", numNodeManagers=" + clusterMetrics.getNumNodeManagers());

    List<NodeReport> clusterNodeReports = yarnClient.getNodeReports(NodeState.RUNNING);
    LOG.info("Got Cluster node info from ASM");
    for (NodeReport node : clusterNodeReports) {
        LOG.info("Got node report from ASM for" + ", nodeId=" + node.getNodeId() + ", nodeAddress"
                + node.getHttpAddress() + ", nodeRackName" + node.getRackName() + ", nodeNumContainers"
                + node.getNumContainers());
    }

    QueueInfo queueInfo = yarnClient.getQueueInfo(this.amQueue);
    LOG.info("Queue info" + ", queueName=" + queueInfo.getQueueName() + ", queueCurrentCapacity="
            + queueInfo.getCurrentCapacity() + ", queueMaxCapacity=" + queueInfo.getMaximumCapacity()
            + ", queueApplicationCount=" + queueInfo.getApplications().size() + ", queueChildQueueCount="
            + queueInfo.getChildQueues().size());

    List<QueueUserACLInfo> listAclInfo = yarnClient.getQueueAclsInfo();
    for (QueueUserACLInfo aclInfo : listAclInfo) {
        for (QueueACL userAcl : aclInfo.getUserAcls()) {
            LOG.info("User ACL Info for Queue" + ", queueName=" + aclInfo.getQueueName() + ", userAcl="
                    + userAcl.name());
        }
    }

    // Get a new application id
    YarnClientApplication app = yarnClient.createApplication();
    GetNewApplicationResponse appResponse = app.getNewApplicationResponse();
    // TODO get min/max resource capabilities from RM and change memory ask if needed
    // If we do not have min/max, we may not be able to correctly request
    // the required resources from the RM for the app master
    // Memory ask has to be a multiple of min and less than max.
    // Dump out information about cluster capability as seen by the resource manager
    int maxMem = appResponse.getMaximumResourceCapability().getMemory();
    LOG.info("Max mem capability of resources in this cluster " + maxMem);

    // A resource ask cannot exceed the max.
    if (amMemory > maxMem) {
        LOG.info("AM memory specified above max threshold of cluster. Using max value." + ", specified="
                + amMemory + ", max=" + maxMem);
        amMemory = maxMem;
    }

    // Set the application name
    ApplicationSubmissionContext appContext = app.getApplicationSubmissionContext();
    ApplicationId appId = appContext.getApplicationId();
    appContext.setApplicationName(appName);

    // Set up the container launch context for the application master
    ContainerLaunchContext amContainer = Records.newRecord(ContainerLaunchContext.class);

    // Set local resources for the application master:
    // local files or archives as needed.
    // In this scenario, the jar file for the application master is part of the local resources.
    Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();

    LOG.info("Copy App Master jar from local filesystem and add to local environment");
    // Copy the application master jar to the filesystem.
    // Create a local resource to point to the destination jar path.
    FileSystem fs = FileSystem.get(conf);
    Path src = new Path(appMasterJar);
    String pathSuffix = appName + "/" + appId.getId() + "/AppMaster.jar";
    Path dst = new Path(fs.getHomeDirectory(), pathSuffix);
    fs.copyFromLocalFile(false, true, src, dst);
    FileStatus destStatus = fs.getFileStatus(dst);
    LocalResource amJarRsrc = Records.newRecord(LocalResource.class);

    // Set the type of resource - file or archive.
    // Archives are untarred at the destination;
    // we don't need the jar file to be untarred for now.
    amJarRsrc.setType(LocalResourceType.FILE);
    // Set visibility of the resource to the most private option.
    amJarRsrc.setVisibility(LocalResourceVisibility.APPLICATION);
    // Set the resource to be copied over.
    amJarRsrc.setResource(ConverterUtils.getYarnUrlFromPath(dst));
    // Set timestamp and length of file so that the framework
    // can do basic sanity checks for the local resource
    // after it has been copied over to ensure it is the same
    // resource the client intended to use with the application.
    amJarRsrc.setTimestamp(destStatus.getModificationTime());
    amJarRsrc.setSize(destStatus.getLen());
    localResources.put("AppMaster.jar", amJarRsrc);

    // Set the log4j properties if needed
    if (!log4jPropFile.isEmpty()) {
        Path log4jSrc = new Path(log4jPropFile);
        Path log4jDst = new Path(fs.getHomeDirectory(), "log4j.props");
        fs.copyFromLocalFile(false, true, log4jSrc, log4jDst);
        FileStatus log4jFileStatus = fs.getFileStatus(log4jDst);
        LocalResource log4jRsrc = Records.newRecord(LocalResource.class);
        log4jRsrc.setType(LocalResourceType.FILE);
        log4jRsrc.setVisibility(LocalResourceVisibility.APPLICATION);
        log4jRsrc.setResource(ConverterUtils.getYarnUrlFromURI(log4jDst.toUri()));
        log4jRsrc.setTimestamp(log4jFileStatus.getModificationTime());
        log4jRsrc.setSize(log4jFileStatus.getLen());
        localResources.put("log4j.properties", log4jRsrc);
    }

    // The shell script has to be made available on the final container(s)
    // where it will be executed. To do this, we first copy it into the
    // filesystem that is visible to the yarn framework.
    // We do not need to set this as a local resource for the application
    // master, as the application master does not need it.
    String hdfsShellScriptLocation = "";
    long hdfsShellScriptLen = 0;
    long hdfsShellScriptTimestamp = 0;
    if (!shellScriptPath.isEmpty()) {
        Path shellSrc = new Path(shellScriptPath);
        String shellPathSuffix = appName + "/" + appId.getId() + "/ExecShellScript.sh";
        Path shellDst = new Path(fs.getHomeDirectory(), shellPathSuffix);
        fs.copyFromLocalFile(false, true, shellSrc, shellDst);
        hdfsShellScriptLocation = shellDst.toUri().toString();
        FileStatus shellFileStatus = fs.getFileStatus(shellDst);
        hdfsShellScriptLen = shellFileStatus.getLen();
        hdfsShellScriptTimestamp = shellFileStatus.getModificationTime();
    }

    // Set local resource info into app master container launch context
    amContainer.setLocalResources(localResources);

    // Set the necessary security tokens as needed
    //amContainer.setContainerTokens(containerToken);

    // Set the env variables to be setup in the env where the application master will be run
    LOG.info("Set the environment for the application master");
    Map<String, String> env = new HashMap<String, String>();

    // Put the location of the shell script into the env.
    // Using this env info, the application master will create the correct local resource for the
    // eventual containers that will be launched to execute the shell scripts.
    env.put(DSConstants.DISTRIBUTEDSHELLSCRIPTLOCATION, hdfsShellScriptLocation);
    env.put(DSConstants.DISTRIBUTEDSHELLSCRIPTTIMESTAMP, Long.toString(hdfsShellScriptTimestamp));
    env.put(DSConstants.DISTRIBUTEDSHELLSCRIPTLEN, Long.toString(hdfsShellScriptLen));

    // Add AppMaster.jar location to classpath.
    // At some point we should not be required to add
    // the hadoop specific classpaths to the env;
    // it should be provided out of the box.
    // For now, set all required classpaths including
    // the classpath to "." for the application jar.
    StringBuilder classPathEnv = new StringBuilder(Environment.CLASSPATH.$()).append(File.pathSeparatorChar)
            .append("./*");
    for (String c : conf.getStrings(YarnConfiguration.YARN_APPLICATION_CLASSPATH,
            YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH)) {
        classPathEnv.append(File.pathSeparatorChar);
        classPathEnv.append(c.trim());
    }
    classPathEnv.append(File.pathSeparatorChar).append("./log4j.properties");

    // Add the runtime classpath needed for tests to work
    if (conf.getBoolean(YarnConfiguration.IS_MINI_YARN_CLUSTER, false)) {
        classPathEnv.append(':');
        classPathEnv.append(System.getProperty("java.class.path"));
    }

    env.put("CLASSPATH", classPathEnv.toString());
    amContainer.setEnvironment(env);

    // Set the necessary command to execute the application master
    Vector<CharSequence> vargs = new Vector<CharSequence>(30);

    // Set java executable command
    LOG.info("Setting up app master command");
    vargs.add(Environment.JAVA_HOME.$() + "/bin/java");
    // Set Xmx based on am memory size
    vargs.add("-Xmx" + amMemory + "m");
    // Set class name
    vargs.add(appMasterMainClass);
    // Set params for Application Master
    vargs.add("--container_memory " + String.valueOf(containerMemory));
    vargs.add("--num_containers " + String.valueOf(numContainers));
    vargs.add("--priority " + String.valueOf(shellCmdPriority));
    if (!shellCommand.isEmpty()) {
        vargs.add("--shell_command " + shellCommand + "");
    }
    if (!shellArgs.isEmpty()) {
        vargs.add("--shell_args " + shellArgs + "");
    }
    for (Map.Entry<String, String> entry : shellEnv.entrySet()) {
        vargs.add("--shell_env " + entry.getKey() + "=" + entry.getValue());
    }
    if (debugFlag) {
        vargs.add("--debug");
    }

    vargs.add("1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/AppMaster.stdout");
    vargs.add("2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/AppMaster.stderr");

    // Get final command
    StringBuilder command = new StringBuilder();
    for (CharSequence str : vargs) {
        command.append(str).append(" ");
    }

    LOG.info("Completed setting up app master command " + command.toString());
    List<String> commands = new ArrayList<String>();
    commands.add(command.toString());
    amContainer.setCommands(commands);

    // Set up resource type requirements.
    // For now, only memory is supported so we set memory requirements.
    Resource capability = Records.newRecord(Resource.class);
    capability.setMemory(amMemory);
    appContext.setResource(capability);

    // Service data is a binary blob that can be passed to the application.
    // Not needed in this scenario.
    // amContainer.setServiceData(serviceData);

    // Setup security tokens
    if (UserGroupInformation.isSecurityEnabled()) {
        Credentials credentials = new Credentials();
        String tokenRenewer = conf.get(YarnConfiguration.RM_PRINCIPAL);
        if (tokenRenewer == null || tokenRenewer.length() == 0) {
            throw new IOException("Can't get Master Kerberos principal for the RM to use as renewer");
        }

        // For now, only getting tokens for the default file-system.
        final Token<?> tokens[] = fs.addDelegationTokens(tokenRenewer, credentials);
        if (tokens != null) {
            for (Token<?> token : tokens) {
                LOG.info("Got dt for " + fs.getUri() + "; " + token);
            }
        }
        DataOutputBuffer dob = new DataOutputBuffer();
        credentials.writeTokenStorageToStream(dob);
        ByteBuffer fsTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength());
        amContainer.setTokens(fsTokens);
    }

    appContext.setAMContainerSpec(amContainer);

    // Set the priority for the application master
    Priority pri = Records.newRecord(Priority.class);
    // TODO - what is the range for priority? how to decide?
    pri.setPriority(amPriority);
    appContext.setPriority(pri);

    // Set the queue to which this application is to be submitted in the RM
    appContext.setQueue(amQueue);

    // Submit the application to the applications manager.
    // SubmitApplicationResponse submitResp = applicationsManager.submitApplication(appRequest);
    // Ignore the response as either a valid response object is returned on success
    // or an exception is thrown to denote some form of failure.
    LOG.info("Submitting application to ASM");
    yarnClient.submitApplication(appContext);

    // TODO
    // Try submitting the same request again
    // app submission failure?

    // Monitor the application
    return monitorApplication(appId);
}
From source file: com.streamsets.pipeline.stage.origin.hdfs.cluster.ClusterHDFSSourceIT.java
License: Apache License
@Test
public void testWrongHDFSDirLocation() throws Exception {
    ClusterHdfsConfigBean conf = new ClusterHdfsConfigBean();
    conf.hdfsUri = miniDFS.getURI().toString();
    conf.hdfsDirLocations = Arrays.asList(dir.toUri().getPath());
    conf.hdfsConfigs = new HashMap<>();
    conf.hdfsConfigs.put("x", "X");
    conf.dataFormat = DataFormat.TEXT;
    conf.dataFormatConfig.textMaxLineLen = 1024;

    conf.hdfsUri = "/pathwithnoschemeorauthority";
    ClusterHdfsSource clusterHdfsSource = createSource(conf);
    try {
        List<ConfigIssue> issues = clusterHdfsSource.init(null, ContextInfoCreator
                .createSourceContext("myInstance", false, OnRecordError.TO_ERROR, ImmutableList.of("lane")));
        assertEquals(String.valueOf(issues), 1, issues.size());
        assertTrue(String.valueOf(issues), issues.get(0).toString().contains("HADOOPFS_02"));

        conf.hdfsUri = "file://localhost:8020/";
        clusterHdfsSource = createSource(conf);
        issues = clusterHdfsSource.init(null, ContextInfoCreator.createSourceContext("myInstance", false,
                OnRecordError.TO_ERROR, ImmutableList.of("lane")));
        assertEquals(String.valueOf(issues), 1, issues.size());
        assertTrue(String.valueOf(issues), issues.get(0).toString().contains("HADOOPFS_12"));

        conf.hdfsUri = "hdfs:///noauthority";
        clusterHdfsSource = createSource(conf);
        issues = clusterHdfsSource.init(null, ContextInfoCreator.createSourceContext("myInstance", false,
                OnRecordError.TO_ERROR, ImmutableList.of("lane")));
        assertEquals(String.valueOf(issues), 1, issues.size());
        assertTrue(String.valueOf(issues), issues.get(0).toString().contains("HADOOPFS_13"));

        conf.hdfsUri = "hdfs://localhost:50000";
        clusterHdfsSource = createSource(conf);
        issues = clusterHdfsSource.init(null, ContextInfoCreator.createSourceContext("myInstance", false,
                OnRecordError.TO_ERROR, ImmutableList.of("lane")));
        assertEquals(String.valueOf(issues), 1, issues.size());
        assertTrue(String.valueOf(issues), issues.get(0).toString().contains("HADOOPFS_11"));

        conf.hdfsUri = miniDFS.getURI().toString();
        conf.hdfsDirLocations = Arrays.asList("/pathdoesnotexist");
        clusterHdfsSource = createSource(conf);
        issues = clusterHdfsSource.init(null, ContextInfoCreator.createSourceContext("myInstance", false,
                OnRecordError.TO_ERROR, ImmutableList.of("lane")));
        assertEquals(String.valueOf(issues), 1, issues.size());
        assertTrue(String.valueOf(issues), issues.get(0).toString().contains("HADOOPFS_10"));

        conf.hdfsUri = miniDFS.getURI().toString();
        conf.hdfsDirLocations = Arrays.asList(dir.toUri().getPath());
        FileSystem fs = miniDFS.getFileSystem();
        Path someFile = new Path(new Path(dir.toUri()), "/someFile");
        fs.create(someFile).close();
        clusterHdfsSource = createSource(conf);
        issues = clusterHdfsSource.init(null, ContextInfoCreator.createSourceContext("myInstance", false,
                OnRecordError.TO_ERROR, ImmutableList.of("lane")));
        assertEquals(String.valueOf(issues), 0, issues.size());

        conf.hdfsUri = null;
        conf.hdfsConfigs.put(CommonConfigurationKeys.FS_DEFAULT_NAME_KEY, miniDFS.getURI().toString());
        someFile = new Path(new Path(dir.toUri()), "/someFile2");
        fs.create(someFile).close();
        clusterHdfsSource = createSource(conf);
        issues = clusterHdfsSource.init(null, ContextInfoCreator.createSourceContext("myInstance", false,
                OnRecordError.TO_ERROR, ImmutableList.of("lane")));
        assertEquals(String.valueOf(issues), 0, issues.size());

        Path dummyFile = new Path(new Path(dir.toUri()), "/dummyFile");
        fs.create(dummyFile).close();
        conf.hdfsUri = miniDFS.getURI().toString();
        conf.hdfsDirLocations = Arrays.asList(dummyFile.toUri().getPath());
        clusterHdfsSource = createSource(conf);
        issues = clusterHdfsSource.init(null, ContextInfoCreator.createSourceContext("myInstance", false,
                OnRecordError.TO_ERROR, ImmutableList.of("lane")));
        assertEquals(String.valueOf(issues), 1, issues.size());
        assertTrue(String.valueOf(issues), issues.get(0).toString().contains("HADOOPFS_15"));

        Path emptyDir = new Path(dir.toUri().getPath(), "emptyDir");
        fs.mkdirs(emptyDir);
        conf.hdfsUri = miniDFS.getURI().toString();
        conf.hdfsDirLocations = Arrays.asList(emptyDir.toUri().getPath());
        clusterHdfsSource = createSource(conf);
        issues = clusterHdfsSource.init(null, ContextInfoCreator.createSourceContext("myInstance", false,
                OnRecordError.TO_ERROR, ImmutableList.of("lane")));
        assertEquals(String.valueOf(issues), 1, issues.size());
        assertTrue(String.valueOf(issues), issues.get(0).toString().contains("HADOOPFS_16"));

        Path path1 = new Path(emptyDir, "path1");
        fs.create(path1).close();
        conf.hdfsUri = miniDFS.getURI().toString();
        conf.hdfsDirLocations = Arrays.asList(emptyDir.toUri().getPath());
        clusterHdfsSource = createSource(conf);
        issues = clusterHdfsSource.init(null, ContextInfoCreator.createSourceContext("myInstance", false,
                OnRecordError.TO_ERROR, ImmutableList.of("lane")));
        assertEquals(String.valueOf(issues), 0, issues.size());
    } finally {
        clusterHdfsSource.destroy();
    }
}
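Throughout the test, dir.toUri().getPath() converts the MiniDFS working directory (a fully qualified hdfs:// path) into the bare directory string the config bean expects, while miniDFS.getURI() supplies the scheme and authority separately. A sketch with illustrative values only (the real MiniDFS port is assigned at runtime):

// Hypothetical MiniDFS values, for illustration.
Path dir = new Path("hdfs://localhost:51234/work/dir");
String hdfsUri = "hdfs://localhost:51234";  // as from miniDFS.getURI().toString()
String dirLocation = dir.toUri().getPath(); // "/work/dir" - scheme and authority stripped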
From source file: com.streamsets.pipeline.stage.origin.hdfs.cluster.TestClusterHDFSSource.java
License: Apache License
@Test
public void testWrongHDFSDirLocation() throws Exception {
    ClusterHdfsDSource dSource = new ForTestClusterHdfsDSource();
    configure(dSource, dir.toUri().getPath());
    dSource.hdfsUri = "/pathwithnoschemeorauthority";
    ClusterHdfsSource clusterHdfsSource = (ClusterHdfsSource) dSource.createSource();
    try {
        List<ConfigIssue> issues = clusterHdfsSource.init(null, ContextInfoCreator
                .createSourceContext("myInstance", false, OnRecordError.TO_ERROR, ImmutableList.of("lane")));
        assertEquals(String.valueOf(issues), 1, issues.size());
        assertTrue(String.valueOf(issues), issues.get(0).toString().contains("HADOOPFS_02"));

        dSource.hdfsUri = "file://localhost:8020/";
        clusterHdfsSource = (ClusterHdfsSource) dSource.createSource();
        issues = clusterHdfsSource.init(null, ContextInfoCreator.createSourceContext("myInstance", false,
                OnRecordError.TO_ERROR, ImmutableList.of("lane")));
        assertEquals(String.valueOf(issues), 1, issues.size());
        assertTrue(String.valueOf(issues), issues.get(0).toString().contains("HADOOPFS_12"));

        dSource.hdfsUri = "hdfs:///noauthority";
        clusterHdfsSource = (ClusterHdfsSource) dSource.createSource();
        issues = clusterHdfsSource.init(null, ContextInfoCreator.createSourceContext("myInstance", false,
                OnRecordError.TO_ERROR, ImmutableList.of("lane")));
        assertEquals(String.valueOf(issues), 1, issues.size());
        assertTrue(String.valueOf(issues), issues.get(0).toString().contains("HADOOPFS_13"));

        dSource.hdfsUri = "hdfs://localhost:8020";
        clusterHdfsSource = (ClusterHdfsSource) dSource.createSource();
        issues = clusterHdfsSource.init(null, ContextInfoCreator.createSourceContext("myInstance", false,
                OnRecordError.TO_ERROR, ImmutableList.of("lane")));
        assertEquals(String.valueOf(issues), 1, issues.size());
        assertTrue(String.valueOf(issues), issues.get(0).toString().contains("HADOOPFS_11"));

        dSource.hdfsUri = miniDFS.getURI().toString();
        dSource.hdfsDirLocations = Arrays.asList("/pathdoesnotexist");
        clusterHdfsSource = (ClusterHdfsSource) dSource.createSource();
        issues = clusterHdfsSource.init(null, ContextInfoCreator.createSourceContext("myInstance", false,
                OnRecordError.TO_ERROR, ImmutableList.of("lane")));
        assertEquals(String.valueOf(issues), 1, issues.size());
        assertTrue(String.valueOf(issues), issues.get(0).toString().contains("HADOOPFS_10"));

        dSource.hdfsUri = miniDFS.getURI().toString();
        dSource.hdfsDirLocations = Arrays.asList(dir.toUri().getPath());
        FileSystem fs = miniDFS.getFileSystem();
        Path someFile = new Path(new Path(dir.toUri()), "/someFile");
        fs.create(someFile).close();
        clusterHdfsSource = (ClusterHdfsSource) dSource.createSource();
        issues = clusterHdfsSource.init(null, ContextInfoCreator.createSourceContext("myInstance", false,
                OnRecordError.TO_ERROR, ImmutableList.of("lane")));
        assertEquals(String.valueOf(issues), 0, issues.size());

        dSource.hdfsUri = null;
        dSource.hdfsConfigs.put(CommonConfigurationKeys.FS_DEFAULT_NAME_KEY, miniDFS.getURI().toString());
        someFile = new Path(new Path(dir.toUri()), "/someFile2");
        fs.create(someFile).close();
        clusterHdfsSource = (ClusterHdfsSource) dSource.createSource();
        issues = clusterHdfsSource.init(null, ContextInfoCreator.createSourceContext("myInstance", false,
                OnRecordError.TO_ERROR, ImmutableList.of("lane")));
        assertEquals(String.valueOf(issues), 0, issues.size());

        Path dummyFile = new Path(new Path(dir.toUri()), "/dummyFile");
        fs.create(dummyFile).close();
        dSource.hdfsUri = miniDFS.getURI().toString();
        dSource.hdfsDirLocations = Arrays.asList(dummyFile.toUri().getPath());
        clusterHdfsSource = (ClusterHdfsSource) dSource.createSource();
        issues = clusterHdfsSource.init(null, ContextInfoCreator.createSourceContext("myInstance", false,
                OnRecordError.TO_ERROR, ImmutableList.of("lane")));
        assertEquals(String.valueOf(issues), 1, issues.size());
        assertTrue(String.valueOf(issues), issues.get(0).toString().contains("HADOOPFS_15"));

        Path emptyDir = new Path(dir.toUri().getPath(), "emptyDir");
        fs.mkdirs(emptyDir);
        dSource.hdfsUri = miniDFS.getURI().toString();
        dSource.hdfsDirLocations = Arrays.asList(emptyDir.toUri().getPath());
        clusterHdfsSource = (ClusterHdfsSource) dSource.createSource();
        issues = clusterHdfsSource.init(null, ContextInfoCreator.createSourceContext("myInstance", false,
                OnRecordError.TO_ERROR, ImmutableList.of("lane")));
        assertEquals(String.valueOf(issues), 1, issues.size());
        assertTrue(String.valueOf(issues), issues.get(0).toString().contains("HADOOPFS_16"));

        Path path1 = new Path(emptyDir, "path1");
        fs.create(path1).close();
        dSource.hdfsUri = miniDFS.getURI().toString();
        dSource.hdfsDirLocations = Arrays.asList(emptyDir.toUri().getPath());
        clusterHdfsSource = (ClusterHdfsSource) dSource.createSource();
        issues = clusterHdfsSource.init(null, ContextInfoCreator.createSourceContext("myInstance", false,
                OnRecordError.TO_ERROR, ImmutableList.of("lane")));
        assertEquals(String.valueOf(issues), 0, issues.size());
    } finally {
        clusterHdfsSource.destroy();
    }
}
From source file: com.talis.hadoop.rdf.merge.IndexMergeReducer.java
License: Apache License
@Override
public void reduce(LongWritable key, Iterable<Text> value, final Context context)
        throws IOException, InterruptedException {
    Runnable reporter = new Runnable() {
        @Override
        public void run() {
            context.progress();
        }
    };
    ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(1);
    ScheduledFuture<?> task = scheduler.scheduleAtFixedRate(reporter, 60, 60, TimeUnit.SECONDS);
    LOG.debug("Scheduled progress reporter, combining index shards");

    FileSystem shardsFs = null;
    for (Text remoteShard : value) {
        Path remote = new Path(remoteShard.toString());
        if (null == shardsFs) {
            shardsFs = FileSystem.get(remote.toUri(), context.getConfiguration());
        }
        LOG.debug("Copying shard from {} to {}", remote, localShards);
        shardsFs.copyToLocalFile(remote, localShards);
        LOG.debug("Copy complete");
    }

    Directory[] shards = getDirectories();
    LOG.debug("About to combine {} shards", shards.length);
    writer.addIndexesNoOptimize(shards);
    LOG.debug("Combined index built, terminating reporter");
    task.cancel(true);
}
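The reducer resolves the owning filesystem from the first shard's URI rather than from the default fs.defaultFS, so shard paths on any scheme or cluster are honored. A minimal sketch of that resolution pattern (hypothetical paths):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class FileSystemForPath {
    // The URI's scheme and authority select both the FileSystem
    // implementation (hdfs, file, s3a, ...) and the target cluster.
    static FileSystem fileSystemFor(Path p, Configuration conf) throws IOException {
        return FileSystem.get(p.toUri(), conf);
    }

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Resolves to the local filesystem; prints file:///
        System.out.println(fileSystemFor(new Path("file:///tmp/shard-0"), conf).getUri());
    }
}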
From source file: com.talis.hadoop.rdf.RdfSolrJob.java
License: Apache License
private void writeShardManifest(String manifestLocation, String shardLocation, Configuration configuration)
        throws IOException {
    Path shardsPath = new Path(INTERMEDIATE_SHARDS_URI);
    FileSystem fs = FileSystem.get(shardsPath.toUri(), configuration);
    StringBuffer buf = new StringBuffer();
    for (FileStatus status : fs.listStatus(shardsPath)) {
        LOG.info(status.getPath() + " : " + status.isDir());
        if (status.isDir()) {
            buf.append(status.getPath());
            buf.append("\n");
        }
    }
    FSDataOutputStream out = fs.create(new Path(manifestLocation));
    out.write(buf.toString().getBytes());
    out.flush();
    out.close();
}
From source file: com.talis.hadoop.rdf.solr.QuadsIndexer.java
License: Apache License
public int run(String[] args) throws Exception {
    Configuration configuration = getConf();

    boolean useCompression = configuration.getBoolean(Constants.OPTION_USE_COMPRESSION,
            Constants.OPTION_USE_COMPRESSION_DEFAULT);
    if (useCompression) {
        configuration.setBoolean("mapred.compress.map.output", true);
        configuration.set("mapred.output.compression.type", "BLOCK");
        configuration.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
    }

    boolean overrideOutput = configuration.getBoolean(Constants.OPTION_OVERRIDE_OUTPUT,
            Constants.OPTION_OVERRIDE_OUTPUT_DEFAULT);
    FileSystem outputFs = FileSystem.get(new Path(args[1]).toUri(), configuration);
    if (overrideOutput) {
        outputFs.delete(new Path(args[1]), true);
    }

    Job job = new Job(configuration);
    job.setJobName(JOB_NAME);
    job.setJarByClass(getClass());

    int shards = -1;
    boolean compressOutput = false;

    Path input = new Path(args[0]);
    Path output = new Path(args[1]);
    Path solrConfig = new Path(args[2]);
    FileInputFormat.addInputPath(job, input);
    FileOutputFormat.setOutputPath(job, output);

    if (shards > 0) {
        job.setNumReduceTasks(shards);
    }

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(QuadArrayWritable.class);

    job.setReducerClass(SolrReducer.class);
    SolrDocumentConverter.setSolrDocumentConverter(LiteralsIndexer.class, job.getConfiguration());
    job.setOutputFormatClass(SolrOutputFormat.class);

    String zipName = "solr.zip";
    FileSystem solrConfigFs = FileSystem.get(solrConfig.toUri(), configuration);
    final URI baseZipUrl = solrConfigFs.getUri().resolve(solrConfig.toString() + '#' + zipName);
    DistributedCache.addCacheArchive(baseZipUrl, job.getConfiguration());
    job.getConfiguration().set(SolrOutputFormat.SETUP_OK, solrConfig.toString());
    SolrOutputFormat.setOutputZipFormat(compressOutput, job.getConfiguration());

    if (LOG.isDebugEnabled())
        Utils.log(job, LOG);

    return job.waitForCompletion(true) ? 0 : -1;
}
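Note the '#' + zipName fragment appended when building the cache-archive URI: the fragment names the symlink the MapReduce framework creates for the archive in each task's working directory. A minimal sketch of the same trick, with a hypothetical archive location, using the old DistributedCache API the example relies on:

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;

public class CacheArchiveFragment {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical archive; tasks see it localized under the link name "solr.zip".
        URI archive = new URI("hdfs://namenode:8020/conf/solr.zip#solr.zip");
        DistributedCache.addCacheArchive(archive, conf);
    }
}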
From source file: com.talis.hadoop.rdf.ZipUtils.java
License: Apache License
/**
 * Write a file to a zip output stream, removing leading path name components
 * from the actual file name when creating the zip file entry.
 *
 * The entry placed in the zip file is <code>baseName</code>/
 * <code>relativePath</code>, where <code>relativePath</code> is constructed
 * by removing a leading <code>root</code> from the path for
 * <code>itemToZip</code>.
 *
 * If <code>itemToZip</code> is an empty directory, it is ignored. If
 * <code>itemToZip</code> is a directory, the contents of the directory are
 * added recursively.
 *
 * @param zos The zip output stream
 * @param baseName The base name to use for the file name entry in the zip file
 * @param root The path to remove from <code>itemToZip</code> to make a
 *        relative path name
 * @param itemToZip The path to the file to be added to the zip file
 * @return the number of entries added
 * @throws IOException
 */
static public int zipDirectory(final Configuration conf, final ZipOutputStream zos, final String baseName,
        final String root, final Path itemToZip) throws IOException {
    LOG.info("zipDirectory: {} {} {}", new Object[] { baseName, root, itemToZip });
    LocalFileSystem localFs = FileSystem.getLocal(conf);
    int count = 0;

    final FileStatus itemStatus = localFs.getFileStatus(itemToZip);
    if (itemStatus.isDir()) {
        final FileStatus[] statai = localFs.listStatus(itemToZip);

        // Add a directory entry to the zip file
        final String zipDirName = relativePathForZipEntry(itemToZip.toUri().getPath(), baseName, root);
        final ZipEntry dirZipEntry = new ZipEntry(zipDirName + Path.SEPARATOR_CHAR);
        LOG.info(String.format("Adding directory %s to zip", zipDirName));
        zos.putNextEntry(dirZipEntry);
        zos.closeEntry();
        count++;

        if (statai == null || statai.length == 0) {
            LOG.info(String.format("Skipping empty directory %s", itemToZip));
            return count;
        }
        for (FileStatus status : statai) {
            count += zipDirectory(conf, zos, baseName, root, status.getPath());
        }
        LOG.info(String.format("Wrote %d entries for directory %s", count, itemToZip));
        return count;
    }

    final String inZipPath = relativePathForZipEntry(itemToZip.toUri().getPath(), baseName, root);
    if (inZipPath.length() == 0) {
        LOG.warn(String.format("Skipping empty zip file path for %s (%s %s)", itemToZip, root, baseName));
        return 0;
    }

    // Take empty files in case the place holder is needed
    FSDataInputStream in = null;
    try {
        in = localFs.open(itemToZip);
        final ZipEntry ze = new ZipEntry(inZipPath);
        ze.setTime(itemStatus.getModificationTime());
        // Comments confuse looking at the zip file
        // ze.setComment(itemToZip.toString());
        zos.putNextEntry(ze);
        IOUtils.copyBytes(in, zos, conf, false);
        zos.closeEntry();
        LOG.info(String.format("Wrote 1 entry for file %s", itemToZip));
        return 1;
    } finally {
        if (in != null) {
            in.close();
        }
    }
}
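relativePathForZipEntry is not shown on this page. A hypothetical reconstruction of the contract the javadoc and caller imply (strip the leading root from the toUri().getPath() string, then prefix the base name), purely for illustration; the real helper lives in ZipUtils and may differ:

// Hypothetical reconstruction - not the actual ZipUtils implementation.
static String relativePathForZipEntry(String rawPath, String baseName, String root) {
    String relative = rawPath.startsWith(root) ? rawPath.substring(root.length()) : rawPath;
    while (relative.startsWith("/")) {
        relative = relative.substring(1);
    }
    // Hypothetical choice: the root itself yields an empty path, which the
    // caller's "empty zip file path" branch then skips.
    return relative.isEmpty() ? "" : baseName + "/" + relative;
}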
From source file: com.thinkbiganalytics.kylo.catalog.aws.S3FileSystemProvider.java
License: Apache License
@Nonnull
@Override
public List<DataSetFile> listFiles(@Nonnull final Path path, @Nonnull final Configuration conf) {
    // Determine the credentials
    final AmazonS3 s3;
    final URI uri = path.toUri();
    if ("s3".equalsIgnoreCase(uri.getScheme()) || "s3bfs".equalsIgnoreCase(uri.getScheme())
            || "s3n".equalsIgnoreCase(uri.getScheme())) {
        s3 = createS3Client(uri, conf);
    } else if ("s3a".equalsIgnoreCase(uri.getScheme())) {
        final Class<? extends S3ClientFactory> s3ClientFactoryClass = conf.getClass(
                Constants.S3_CLIENT_FACTORY_IMPL, Constants.DEFAULT_S3_CLIENT_FACTORY_IMPL,
                S3ClientFactory.class);
        try {
            s3 = ReflectionUtils.newInstance(s3ClientFactoryClass, conf).createS3Client(uri);
        } catch (final IOException e) {
            throw new IllegalArgumentException("Unable to create S3 client: " + e, e);
        }
    } else {
        log.debug("Scheme {} not supported for S3 path: {}", uri.getScheme(), path);
        throw new CatalogException("catalog.fs.s3.invalidScheme", uri.getScheme());
    }

    // Fetch the list of buckets
    try {
        return s3.listBuckets().stream().map(bucket -> {
            final DataSetFile file = new DataSetFile();
            file.setName(bucket.getName());
            file.setDirectory(true);
            file.setModificationTime(bucket.getCreationDate().getTime());
            file.setPath(uri.getScheme() + "://" + bucket.getName() + "/");
            return file;
        }).collect(Collectors.toList());
    } finally {
        s3.shutdown();
    }
}
From source file: com.thinkbiganalytics.kylo.catalog.aws.S3FileSystemProvider.java
License: Apache License
@Override
public boolean supportsPath(@Nonnull final Path path) {
    final URI uri = path.toUri();
    return (uri.getScheme() != null
            && (uri.getScheme().startsWith("s3") || uri.getScheme().startsWith("S3"))
            && path.toUri().getHost() == null);
}
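The predicate keys entirely off the Path's URI: a scheme starting with "s3" (case-insensitively, via the two startsWith checks) and no parsed host. Illustrative expectations, assuming standard java.net.URI parsing of the authority and hypothetical bucket names:

import org.apache.hadoop.fs.Path;

public class SupportsPathDemo {
    public static void main(String[] args) {
        // "bucket" parses as the URI host, so the host-is-null test fails.
        System.out.println(new Path("s3a://bucket/data").toUri().getHost()); // bucket
        // No authority at all: host is null, so this form passes the check.
        System.out.println(new Path("s3a:///data").toUri().getHost());      // null
    }
}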