List of usage examples for org.apache.hadoop.fs FileSystem create
public FSDataOutputStream create(Path f, short replication) throws IOException
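None of the examples below calls this exact overload (most use create(Path f, boolean overwrite)), so here is a minimal self-contained sketch of it. The path and replication value are hypothetical; this overload overwrites any existing file and uses the filesystem's default block and buffer sizes.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class CreateWithReplication {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/replicated-example.txt"); // hypothetical path
        FileSystem fs = path.getFileSystem(conf);
        // Overwrites any existing file at the path and requests a
        // replication factor of 3 (a hypothetical value; HDFS caps it at
        // dfs.replication.max, and local filesystems ignore it).
        FSDataOutputStream out = fs.create(path, (short) 3);
        try {
            out.writeBytes("hello, replicated file\n");
        } finally {
            out.close();
        }
        fs.close();
    }
}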
From source file:com.cloudera.knittingboar.sgd.POLRMasterDriver.java
License:Apache License
/**
 * [ needs to be checked ]
 *
 * NOTE: This should only be used for durability purposes in checkpointing the
 * workers
 *
 * @param outputFilename
 * @param conf
 * @throws Exception
 */
public void SaveModelToHDFS(String outputFilename, Configuration conf) throws Exception {
    Path path = new Path(outputFilename);
    FileSystem fs = path.getFileSystem(conf);
    FSDataOutputStream modelHDFSOutput = fs.create(path, true);
    try {
        polr_modelparams.saveTo(modelHDFSOutput);
    } finally {
        modelHDFSOutput.close();
    }
}
From source file:com.cloudera.recordbreaker.analyzer.CSVDataDescriptor.java
License:Open Source License
public void prepareAvroFile(FileSystem srcFs, FileSystem dstFs, Path dst, Configuration conf)
        throws IOException {
    // THIS IS WHERE THE MAGIC HAPPENS!!!
    // Convert CSV into Avro!!!!
    SchemaDescriptor sd = this.getSchemaDescriptor().get(0);
    List<Schema> unionFreeSchemas = SchemaUtils.getUnionFreeSchemasByFrequency(sd, 100, true);
    Schema schema = unionFreeSchemas.get(0);
    String headerRowHash = new String(sd.getPayload());
    CSVRowParser rowParser = new CSVRowParser(schema, headerRowHash);

    // Open stream to write out Avro contents
    DatumWriter<GenericRecord> writer = new GenericDatumWriter<GenericRecord>(schema);
    DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(writer);
    dataFileWriter.create(schema, dstFs.create(dst, true));
    int numRecords = 0;
    int MAX_RECORDS = 1000;
    try {
        BufferedReader in = new BufferedReader(new InputStreamReader(srcFs.open(getFilename())));
        try {
            String rowStr = null;
            while (((rowStr = in.readLine()) != null) && (numRecords < MAX_RECORDS)) {
                if (("" + rowStr.hashCode()).compareTo(headerRowHash) == 0) {
                    continue;
                }
                GenericData.Record record = rowParser.parseRow(rowStr);
                if (record == null) {
                    continue;
                }
                if (record.getSchema().toString().hashCode() != schema.toString().hashCode()) {
                    continue;
                }
                dataFileWriter.append(record);
                numRecords++;
            }
        } finally {
            in.close();
        }
    } finally {
        dataFileWriter.close();
    }
}
From source file:com.cloudera.recordbreaker.analyzer.TextRegexpDataDescriptor.java
License:Open Source License
public void prepareAvroFile(FileSystem srcFs, FileSystem dstFs, Path dst, Configuration conf)
        throws IOException {
    SchemaDescriptor sd = this.getSchemaDescriptor().get(0);
    List<Schema> unionFreeSchemas = SchemaUtils.getUnionFreeSchemasByFrequency(sd, 100, true);
    Schema schema = unionFreeSchemas.get(0);

    // Open stream to write out Avro contents
    DatumWriter<GenericRecord> writer = new GenericDatumWriter<GenericRecord>(schema);
    DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(writer);
    dataFileWriter.create(schema, dstFs.create(dst, true));
    try {
        BufferedReader in = new BufferedReader(new InputStreamReader(srcFs.open(getFilename())));
        try {
            String rowStr = null;
            GenericData.Record rowRecord = null;
            while ((rowStr = in.readLine()) != null) {
                for (int i = 0; i < regexps.size(); i++) {
                    Pattern curPattern = regexps.get(i);
                    Schema curSchema = localschemas.get(i);
                    Matcher curMatcher = curPattern.matcher(rowStr);

                    if (curMatcher.find()) {
                        // Create Avro record here
                        rowRecord = new GenericData.Record(curSchema);
                        List<Schema.Field> curFields = curSchema.getFields();

                        for (int j = 0; j < curMatcher.groupCount(); j++) {
                            Schema.Field curField = curFields.get(j);
                            String fieldName = curField.name();
                            Schema fieldType = curField.schema();
                            String rawFieldValue = curMatcher.group(j + 1);

                            Object fieldValue = null;
                            if (fieldType.getType() == Schema.Type.INT) {
                                fieldValue = Integer.parseInt(rawFieldValue);
                            } else if (fieldType.getType() == Schema.Type.FLOAT) {
                                fieldValue = Float.parseFloat(rawFieldValue);
                            } else if (fieldType.getType() == Schema.Type.STRING) {
                                fieldValue = rawFieldValue;
                            }
                            if (fieldValue != null) {
                                rowRecord.put(fieldName, fieldValue);
                            }
                        }
                        if (rowRecord.getSchema().toString().hashCode() == schema.toString().hashCode()) {
                            dataFileWriter.append(rowRecord);
                        }
                    }
                }
            }
        } finally {
            in.close();
        }
    } finally {
        dataFileWriter.close();
    }
}
From source file:com.cloudera.recordbreaker.analyzer.UnknownTextDataDescriptor.java
License:Open Source License
public void prepareAvroFile(FileSystem srcFs, FileSystem dstFs, Path dst, Configuration conf)
        throws IOException {
    SchemaDescriptor sd = this.getSchemaDescriptor().get(0);
    List<Schema> unionFreeSchemas = SchemaUtils.getUnionFreeSchemasByFrequency(sd, 100, true);
    Schema schema = unionFreeSchemas.get(0);

    // Open stream to write out Avro contents
    DatumWriter<GenericRecord> writer = new GenericDatumWriter<GenericRecord>(schema);
    DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(writer);
    dataFileWriter.create(schema, dstFs.create(dst, true));
    int numRecords = 0;
    int MAX_RECORDS = 1000;
    try {
        for (Iterator it = sd.getIterator(); it.hasNext() && numRecords < MAX_RECORDS;) {
            GenericData.Record rowRecord = (GenericData.Record) it.next();
            if (rowRecord.getSchema().toString().hashCode() != schema.toString().hashCode()) {
                continue;
            }
            dataFileWriter.append(rowRecord);
            numRecords++;
        }
    } finally {
        dataFileWriter.close();
    }
}
From source file:com.cloudera.sqoop.mapreduce.RawKeyTextOutputFormat.java
License:Apache License
public RecordWriter<K, V> getRecordWriter(TaskAttemptContext context) throws IOException {
    boolean isCompressed = getCompressOutput(context);
    Configuration conf = context.getConfiguration();
    String ext = "";
    CompressionCodec codec = null;

    if (isCompressed) {
        // create the named codec
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(context, GzipCodec.class);
        codec = ReflectionUtils.newInstance(codecClass, conf);

        ext = codec.getDefaultExtension();
    }

    Path file = getDefaultWorkFile(context, ext);
    FileSystem fs = file.getFileSystem(conf);
    FSDataOutputStream fileOut = fs.create(file, false);
    DataOutputStream ostream = fileOut;

    if (isCompressed) {
        ostream = new DataOutputStream(codec.createOutputStream(fileOut));
    }

    return new RawKeyRecordWriter<K, V>(ostream);
}
From source file:com.conversantmedia.mapreduce.output.BloomFilterOutputFormat.java
License:Apache License
@Override
public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
    if (writer == null) {
        int size = getExpectedInsertions(job);
        checkState(size > 0, "Expected insertion insertionSize not set.");
        Configuration conf = job.getConfiguration();
        String extension = "";
        Path file = getDefaultWorkFile(job, extension);
        FileSystem fs = file.getFileSystem(conf);
        FSDataOutputStream fileOut = fs.create(file, false);
        writer = new BloomFilterRecordWriter<>(fileOut, size);
    }
    return writer;
}
From source file:com.cotdp.hadoop.ZipFileTest.java
License:Apache License
/**
 * Simple utility function to copy files into HDFS
 *
 * @param fs
 * @param name
 * @throws IOException
 */
private void copyFile(FileSystem fs, String name) throws IOException {
    LOG.info("copyFile: " + name);
    InputStream is = this.getClass().getResourceAsStream("/" + name);
    OutputStream os = fs.create(new Path(inputPath, name), true);
    IOUtils.copyBytes(is, os, conf);
    os.close();
    is.close();
}
From source file:com.datasalt.pangool.utils.HadoopUtils.java
License:Apache License
/**
 * Creates a file with the given string, overwriting if needed.
 */
public static void stringToFile(FileSystem fs, Path path, String string) throws IOException {
    OutputStream os = fs.create(path, true);
    PrintWriter pw = new PrintWriter(os);
    pw.append(string);
    pw.close();
}
From source file:com.datatorrent.stram.StramClient.java
License:Apache License
/**
 * Launch application for the dag represented by this client.
 *
 * @throws YarnException
 * @throws IOException
 */
public void startApplication() throws YarnException, IOException {
    Class<?>[] defaultClasses;

    if (applicationType.equals(YARN_APPLICATION_TYPE)) {
        //TODO restrict the security check to only check if security is enabled for webservices.
        if (UserGroupInformation.isSecurityEnabled()) {
            defaultClasses = DATATORRENT_SECURITY_CLASSES;
        } else {
            defaultClasses = DATATORRENT_CLASSES;
        }
    } else {
        throw new IllegalStateException(applicationType + " is not a valid application type.");
    }

    LinkedHashSet<String> localJarFiles = findJars(dag, defaultClasses);

    if (resources != null) {
        localJarFiles.addAll(resources);
    }

    YarnClusterMetrics clusterMetrics = yarnClient.getYarnClusterMetrics();
    LOG.info("Got Cluster metric info from ASM" + ", numNodeManagers=" + clusterMetrics.getNumNodeManagers());

    //GetClusterNodesRequest clusterNodesReq = Records.newRecord(GetClusterNodesRequest.class);
    //GetClusterNodesResponse clusterNodesResp = rmClient.clientRM.getClusterNodes(clusterNodesReq);
    //LOG.info("Got Cluster node info from ASM");
    //for (NodeReport node : clusterNodesResp.getNodeReports()) {
    //  LOG.info("Got node report from ASM for"
    //      + ", nodeId=" + node.getNodeId()
    //      + ", nodeAddress" + node.getHttpAddress()
    //      + ", nodeRackName" + node.getRackName()
    //      + ", nodeNumContainers" + node.getNumContainers()
    //      + ", nodeHealthStatus" + node.getHealthReport());
    //}
    List<QueueUserACLInfo> listAclInfo = yarnClient.getQueueAclsInfo();
    for (QueueUserACLInfo aclInfo : listAclInfo) {
        for (QueueACL userAcl : aclInfo.getUserAcls()) {
            LOG.info("User ACL Info for Queue" + ", queueName=" + aclInfo.getQueueName() + ", userAcl="
                    + userAcl.name());
        }
    }

    // Get a new application id
    YarnClientApplication newApp = yarnClient.createApplication();
    appId = newApp.getNewApplicationResponse().getApplicationId();

    // Dump out information about cluster capability as seen by the resource manager
    int maxMem = newApp.getNewApplicationResponse().getMaximumResourceCapability().getMemory();
    LOG.info("Max mem capabililty of resources in this cluster " + maxMem);
    int amMemory = dag.getMasterMemoryMB();
    if (amMemory > maxMem) {
        LOG.info("AM memory specified above max threshold of cluster. Using max value." + ", specified="
                + amMemory + ", max=" + maxMem);
        amMemory = maxMem;
    }

    if (dag.getAttributes().get(LogicalPlan.APPLICATION_ID) == null) {
        dag.setAttribute(LogicalPlan.APPLICATION_ID, appId.toString());
    }

    // Create launch context for app master
    LOG.info("Setting up application submission context for ASM");
    ApplicationSubmissionContext appContext = Records.newRecord(ApplicationSubmissionContext.class);

    // set the application id
    appContext.setApplicationId(appId);
    // set the application name
    appContext.setApplicationName(dag.getValue(LogicalPlan.APPLICATION_NAME));
    appContext.setApplicationType(this.applicationType);
    if (YARN_APPLICATION_TYPE.equals(this.applicationType)) {
        //appContext.setMaxAppAttempts(1); // no retries until Stram is HA
    }

    // Set up the container launch context for the application master
    ContainerLaunchContext amContainer = Records.newRecord(ContainerLaunchContext.class);

    // Setup security tokens
    // If security is enabled get ResourceManager and NameNode delegation tokens.
    // Set these tokens on the container so that they are sent as part of application submission.
    // This also sets them up for renewal by ResourceManager. The NameNode delegation rmToken
    // is also used by ResourceManager to fetch the jars from HDFS and set them up for the
    // application master launch.
    if (UserGroupInformation.isSecurityEnabled()) {
        Credentials credentials = new Credentials();
        String tokenRenewer = conf.get(YarnConfiguration.RM_PRINCIPAL);
        if (tokenRenewer == null || tokenRenewer.length() == 0) {
            throw new IOException("Can't get Master Kerberos principal for the RM to use as renewer");
        }

        // For now, only getting tokens for the default file-system.
        FileSystem fs = StramClientUtils.newFileSystemInstance(conf);
        try {
            final Token<?> tokens[] = fs.addDelegationTokens(tokenRenewer, credentials);
            if (tokens != null) {
                for (Token<?> token : tokens) {
                    LOG.info("Got dt for " + fs.getUri() + "; " + token);
                }
            }
        } finally {
            fs.close();
        }

        addRMDelegationToken(tokenRenewer, credentials);

        DataOutputBuffer dob = new DataOutputBuffer();
        credentials.writeTokenStorageToStream(dob);
        ByteBuffer fsTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength());
        amContainer.setTokens(fsTokens);
    }

    // set local resources for the application master
    // local files or archives as needed
    // In this scenario, the jar file for the application master is part of the local resources
    Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();

    // copy required jar files to dfs, to be localized for containers
    FileSystem fs = StramClientUtils.newFileSystemInstance(conf);
    try {
        Path appsBasePath = new Path(StramClientUtils.getDTDFSRootDir(fs, conf), StramClientUtils.SUBDIR_APPS);
        Path appPath = new Path(appsBasePath, appId.toString());

        String libJarsCsv = copyFromLocal(fs, appPath, localJarFiles.toArray(new String[] {}));

        LOG.info("libjars: {}", libJarsCsv);
        dag.getAttributes().put(LogicalPlan.LIBRARY_JARS, libJarsCsv);
        LaunchContainerRunnable.addFilesToLocalResources(LocalResourceType.FILE, libJarsCsv, localResources, fs);

        if (archives != null) {
            String[] localFiles = archives.split(",");
            String archivesCsv = copyFromLocal(fs, appPath, localFiles);
            LOG.info("archives: {}", archivesCsv);
            dag.getAttributes().put(LogicalPlan.ARCHIVES, archivesCsv);
            LaunchContainerRunnable.addFilesToLocalResources(LocalResourceType.ARCHIVE, archivesCsv,
                    localResources, fs);
        }

        if (files != null) {
            String[] localFiles = files.split(",");
            String filesCsv = copyFromLocal(fs, appPath, localFiles);
            LOG.info("files: {}", filesCsv);
            dag.getAttributes().put(LogicalPlan.FILES, filesCsv);
            LaunchContainerRunnable.addFilesToLocalResources(LocalResourceType.FILE, filesCsv, localResources, fs);
        }

        dag.getAttributes().put(LogicalPlan.APPLICATION_PATH, appPath.toString());
        if (dag.getAttributes().get(OperatorContext.STORAGE_AGENT) == null) { /* which would be the most likely case */
            Path checkpointPath = new Path(appPath, LogicalPlan.SUBDIR_CHECKPOINTS);
            // use conf client side to pickup any proxy settings from dt-site.xml
            dag.setAttribute(OperatorContext.STORAGE_AGENT, new FSStorageAgent(checkpointPath.toString(), conf));
        }

        if (dag.getAttributes().get(LogicalPlan.CONTAINER_OPTS_CONFIGURATOR) == null) {
            dag.setAttribute(LogicalPlan.CONTAINER_OPTS_CONFIGURATOR, new BasicContainerOptConfigurator());
        }

        // Set the log4j properties if needed
        if (!log4jPropFile.isEmpty()) {
            Path log4jSrc = new Path(log4jPropFile);
            Path log4jDst = new Path(appPath, "log4j.props");
            fs.copyFromLocalFile(false, true, log4jSrc, log4jDst);
            FileStatus log4jFileStatus = fs.getFileStatus(log4jDst);
            LocalResource log4jRsrc = Records.newRecord(LocalResource.class);
            log4jRsrc.setType(LocalResourceType.FILE);
            log4jRsrc.setVisibility(LocalResourceVisibility.APPLICATION);
            log4jRsrc.setResource(ConverterUtils.getYarnUrlFromURI(log4jDst.toUri()));
            log4jRsrc.setTimestamp(log4jFileStatus.getModificationTime());
            log4jRsrc.setSize(log4jFileStatus.getLen());
            localResources.put("log4j.properties", log4jRsrc);
        }

        if (originalAppId != null) {
            Path origAppPath = new Path(appsBasePath, this.originalAppId);
            LOG.info("Restart from {}", origAppPath);
            copyInitialState(origAppPath);
        }

        // push logical plan to DFS location
        Path cfgDst = new Path(appPath, LogicalPlan.SER_FILE_NAME);
        FSDataOutputStream outStream = fs.create(cfgDst, true);
        LogicalPlan.write(this.dag, outStream);
        outStream.close();

        Path launchConfigDst = new Path(appPath, LogicalPlan.LAUNCH_CONFIG_FILE_NAME);
        outStream = fs.create(launchConfigDst, true);
        conf.writeXml(outStream);
        outStream.close();

        FileStatus topologyFileStatus = fs.getFileStatus(cfgDst);
        LocalResource topologyRsrc = Records.newRecord(LocalResource.class);
        topologyRsrc.setType(LocalResourceType.FILE);
        topologyRsrc.setVisibility(LocalResourceVisibility.APPLICATION);
        topologyRsrc.setResource(ConverterUtils.getYarnUrlFromURI(cfgDst.toUri()));
        topologyRsrc.setTimestamp(topologyFileStatus.getModificationTime());
        topologyRsrc.setSize(topologyFileStatus.getLen());
        localResources.put(LogicalPlan.SER_FILE_NAME, topologyRsrc);

        // Set local resource info into app master container launch context
        amContainer.setLocalResources(localResources);

        // Set the necessary security tokens as needed
        //amContainer.setContainerTokens(containerToken);

        // Set the env variables to be setup in the env where the application master will be run
        LOG.info("Set the environment for the application master");
        Map<String, String> env = new HashMap<String, String>();

        // Add application jar(s) location to classpath
        // At some point we should not be required to add
        // the hadoop specific classpaths to the env.
        // It should be provided out of the box.
        // For now setting all required classpaths including
        // the classpath to "." for the application jar(s)
        // including ${CLASSPATH} will duplicate the class path in app master, removing it for now
        //StringBuilder classPathEnv = new StringBuilder("${CLASSPATH}:./*");
        StringBuilder classPathEnv = new StringBuilder("./*");
        String classpath = conf.get(YarnConfiguration.YARN_APPLICATION_CLASSPATH);
        for (String c : StringUtils.isBlank(classpath) ? YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH
                : classpath.split(",")) {
            if (c.equals("$HADOOP_CLIENT_CONF_DIR")) {
                // SPOI-2501
                continue;
            }
            classPathEnv.append(':');
            classPathEnv.append(c.trim());
        }
        env.put("CLASSPATH", classPathEnv.toString());
        // propagate to replace node managers user name (effective in non-secure mode)
        env.put("HADOOP_USER_NAME", UserGroupInformation.getLoginUser().getUserName());

        amContainer.setEnvironment(env);

        // Set the necessary command to execute the application master
        ArrayList<CharSequence> vargs = new ArrayList<CharSequence>(30);

        // Set java executable command
        LOG.info("Setting up app master command");
        vargs.add(javaCmd);
        if (dag.isDebug()) {
            vargs.add("-agentlib:jdwp=transport=dt_socket,server=y,suspend=n");
        }
        // Set Xmx based on am memory size
        // default heap size 75% of total memory
        if (dag.getMasterJVMOptions() != null) {
            vargs.add(dag.getMasterJVMOptions());
        }
        vargs.add("-Xmx" + (amMemory * 3 / 4) + "m");
        vargs.add("-XX:+HeapDumpOnOutOfMemoryError");
        vargs.add("-XX:HeapDumpPath=/tmp/dt-heap-" + appId.getId() + ".bin");
        vargs.add("-Dhadoop.root.logger=" + (dag.isDebug() ? "DEBUG" : "INFO") + ",RFA");
        vargs.add("-Dhadoop.log.dir=" + ApplicationConstants.LOG_DIR_EXPANSION_VAR);
        vargs.add(String.format("-D%s=%s", StreamingContainer.PROP_APP_PATH, dag.assertAppPath()));
        if (dag.isDebug()) {
            vargs.add("-Dlog4j.debug=true");
        }

        String loggersLevel = conf.get(DTLoggerFactory.DT_LOGGERS_LEVEL);
        if (loggersLevel != null) {
            vargs.add(String.format("-D%s=%s", DTLoggerFactory.DT_LOGGERS_LEVEL, loggersLevel));
        }
        vargs.add(StreamingAppMaster.class.getName());
        vargs.add("1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/AppMaster.stdout");
        vargs.add("2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/AppMaster.stderr");

        // Get final command
        StringBuilder command = new StringBuilder(9 * vargs.size());
        for (CharSequence str : vargs) {
            command.append(str).append(" ");
        }

        LOG.info("Completed setting up app master command " + command.toString());
        List<String> commands = new ArrayList<String>();
        commands.add(command.toString());
        amContainer.setCommands(commands);

        // Set up resource type requirements
        // For now, only memory is supported so we set memory requirements
        Resource capability = Records.newRecord(Resource.class);
        capability.setMemory(amMemory);
        appContext.setResource(capability);

        // Service data is a binary blob that can be passed to the application
        // Not needed in this scenario
        // amContainer.setServiceData(serviceData);

        appContext.setAMContainerSpec(amContainer);

        // Set the priority for the application master
        Priority pri = Records.newRecord(Priority.class);
        pri.setPriority(amPriority);
        appContext.setPriority(pri);
        // Set the queue to which this application is to be submitted in the RM
        appContext.setQueue(queueName);

        // Submit the application to the applications manager
        // SubmitApplicationResponse submitResp = rmClient.submitApplication(appRequest);
        // Ignore the response as either a valid response object is returned on success
        // or an exception thrown to denote some form of a failure
        String specStr = Objects.toStringHelper("Submitting application: ")
                .add("name", appContext.getApplicationName()).add("queue", appContext.getQueue())
                .add("user", UserGroupInformation.getLoginUser()).add("resource", appContext.getResource())
                .toString();
        LOG.info(specStr);
        if (dag.isDebug()) {
            //LOG.info("Full submission context: " + appContext);
        }
        yarnClient.submitApplication(appContext);
    } finally {
        fs.close();
    }
}
From source file:com.datatorrent.stram.util.FSUtil.java
License:Apache License
/**
 * Copied from FileUtil to transfer ownership
 *
 * @param srcFS
 * @param srcStatus
 * @param dstFS
 * @param dst
 * @param deleteSource
 * @param overwrite
 * @param conf
 * @return
 * @throws IOException
 */
public static boolean copy(FileSystem srcFS, FileStatus srcStatus, FileSystem dstFS, Path dst,
        boolean deleteSource, boolean overwrite, Configuration conf) throws IOException {
    Path src = srcStatus.getPath();
    //dst = checkDest(src.getName(), dstFS, dst, overwrite);
    if (srcStatus.isDirectory()) {
        //checkDependencies(srcFS, src, dstFS, dst);
        if (!mkdirs(dstFS, dst)) {
            return false;
        }
        FileStatus contents[] = srcFS.listStatus(src);
        for (int i = 0; i < contents.length; i++) {
            copy(srcFS, contents[i], dstFS, new Path(dst, contents[i].getPath().getName()),
                    deleteSource, overwrite, conf);
        }
    } else {
        InputStream in = null;
        OutputStream out = null;
        try {
            in = srcFS.open(src);
            out = dstFS.create(dst, overwrite);
            org.apache.hadoop.io.IOUtils.copyBytes(in, out, conf, true);
        } catch (IOException e) {
            org.apache.hadoop.io.IOUtils.closeStream(out);
            org.apache.hadoop.io.IOUtils.closeStream(in);
            throw e;
        }
    }

    // TODO: change group and limit write to group
    if (srcStatus.isDirectory()) {
        dstFS.setPermission(dst, new FsPermission((short) 0777));
    } else {
        dstFS.setPermission(dst, new FsPermission((short) 0777)/*"ugo+w"*/);
    }

    //dstFS.setOwner(dst, null, srcStatus.getGroup());
    /*
    try {
        // transfer owner
        // DOES NOT WORK only super user can change file owner
        dstFS.setOwner(dst, srcStatus.getOwner(), srcStatus.getGroup());
    } catch (IOException e) {
        LOG.warn("Failed to change owner on {} to {}", dst, srcStatus.getOwner(), e);
        throw e;
    }
    */

    if (deleteSource) {
        return srcFS.delete(src, true);
    } else {
        return true;
    }
}