List of usage examples for org.apache.hadoop.fs FileSystem get
public static FileSystem get(Configuration conf) throws IOException
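Before the per-source-file examples below, a minimal sketch of the basic pattern: obtain the FileSystem for the configured default URI and use it for a simple path check. The class name FileSystemGetExample and the path /tmp/example are hypothetical placeholders, and constructing the Configuration directly with new Configuration() is an assumption; most of the examples below get it from getConf() or a JobConf instead.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class FileSystemGetExample {
    public static void main(String[] args) throws Exception {
        // Build a Configuration; in a Tool this would usually come from getConf().
        Configuration conf = new Configuration();

        // FileSystem.get(conf) returns the FileSystem backing the default URI
        // (fs.defaultFS), e.g. HDFS on a cluster or the local file system otherwise.
        FileSystem fs = FileSystem.get(conf);

        // /tmp/example is a hypothetical path used only for illustration.
        Path path = new Path("/tmp/example");
        System.out.println(path + " exists: " + fs.exists(path));
    }
}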
From source file:ca.uwaterloo.cs.bigdata2017w.assignment4.BuildPersonalizedPageRankRecords.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(
            OptionBuilder.withArgName("num").hasArg().withDescription("number of nodes").create(NUM_NODES));
    options.addOption(
            OptionBuilder.withArgName("sources").hasArg().withDescription("source nodes").create(SOURCES));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT) || !cmdline.hasOption(NUM_NODES)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int n = Integer.parseInt(cmdline.getOptionValue(NUM_NODES));
    String sourcesString = cmdline.getOptionValue(SOURCES);
    String[] sources = sourcesString.split(",");
    for (int i = 0; i < sources.length; i++) {
        sources[i] = sources[i].trim();
    }

    LOG.info("Tool name: " + BuildPersonalizedPageRankRecords.class.getSimpleName());
    LOG.info(" - inputDir: " + inputPath);
    LOG.info(" - outputDir: " + outputPath);
    LOG.info(" - numNodes: " + n);
    LOG.info(" - use sources: " + sourcesString);

    Configuration conf = getConf();
    conf.setInt(NODE_CNT_FIELD, n);
    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    conf.setStrings(SOURCES, sources);

    Job job = Job.getInstance(conf);
    job.setJobName(BuildPersonalizedPageRankRecords.class.getSimpleName() + ":" + inputPath);
    job.setJarByClass(BuildPersonalizedPageRankRecords.class);

    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    job.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    return 0;
}
From source file:ca.uwaterloo.iss4e.hadoop.pointperrow.CosineMain.java
License:Open Source License
public int run(String[] args) throws IOException {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: ca.uwaterloo.iss4e.hadoop.pointperrow.ConsineMain <input> <output>");
        System.exit(2);
    }

    Job job1 = new Job(conf, "ConsineMain");
    job1.setJarByClass(CosineMain.class);

    job1.setMapperClass(AggregateReadingsMapper.class);
    job1.setMapOutputKeyClass(LongWritable.class);
    job1.setMapOutputValueClass(DoubleWritable.class);

    job1.setReducerClass(AggregateReadingsReducer.class);
    job1.setOutputKeyClass(LongWritable.class);
    job1.setOutputValueClass(Text.class);

    FileInputFormat.setInputDirRecursive(job1, true);
    FileInputFormat.setInputPaths(job1, new Path(otherArgs[0]));

    int lastIdx = otherArgs[0].lastIndexOf("/");
    String tempOutput = otherArgs[0].substring(0, lastIdx) + "/temp";
    FileOutputFormat.setOutputPath(job1, new Path(tempOutput));

    System.out.println("\nStarting Job-1 ...");
    final long startTime = System.currentTimeMillis();
    try {
        final long startTimeJob1 = System.currentTimeMillis();
        if (!job1.waitForCompletion(true)) {
            System.out.println("Job-1 failed.");
        } else {
            System.out.println("Duration of Job1 "
                    + ((System.currentTimeMillis() - startTimeJob1) / 1000.0) + " seconds.");

            final Job job2 = new Job(conf, "ConsineMain Aggregate");
            job2.setJarByClass(CosineMain.class);
            job2.setInputFormatClass(CartesianInputFormat.class);
            CartesianInputFormat.setLeftInputInfo(job2, TextInputFormat.class, tempOutput);
            CartesianInputFormat.setRightInputInfo(job2, TextInputFormat.class, tempOutput);
            FileOutputFormat.setOutputPath(job2, new Path(otherArgs[1]));

            job2.setMapperClass(CartesianProductMapper.class);
            job2.setMapOutputKeyClass(DoubleWritable.class);
            job2.setMapOutputValueClass(Text.class);

            job2.setSortComparatorClass(DescendingKeyComparator.class);

            job2.setReducerClass(CartesianProductReducer.class);
            job2.setOutputKeyClass(Text.class);
            job2.setOutputValueClass(DoubleWritable.class);

            job2.setNumReduceTasks(10);

            final long startTimeJob2 = System.currentTimeMillis();
            System.out.println("\nStarting Job-2 ...");
            if (!job2.waitForCompletion(true)) {
                System.out.println("Job-2 failed.");
            } else {
                System.out.println("Duration of Job2: "
                        + ((System.currentTimeMillis() - startTimeJob2) / 1000.0) + " seconds.");
            }
        }

        FileSystem fs = FileSystem.get(conf);
        fs.delete(new Path(tempOutput), true);
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("Total Duration: " + duration + " seconds.");
    }
    return 0;
}
From source file:cascading.ClusterTestCase.java
License:Open Source License
public FileSystem getFileSystem() throws IOException {
    if (fileSys != null)
        return fileSys;

    return FileSystem.get(jobConf);
}
From source file:cascading.flow.hadoop.util.HadoopUtil.java
License:Open Source License
public static FileSystem getDefaultFS(Configuration config) {
    try {
        return FileSystem.get(config);
    } catch (IOException exception) {
        throw new FlowException("unable to get handle to underlying filesystem", exception);
    }
}
From source file:cascading.flow.hadoop.util.HadoopUtil.java
License:Open Source License
private static Path getWorkingDirectory(Configuration conf) {
    String name = conf.get("mapred.working.dir");

    if (name != null) {
        return new Path(name);
    } else {
        try {
            Path dir = FileSystem.get(conf).getWorkingDirectory();
            conf.set("mapred.working.dir", dir.toString());
            return dir;
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}
From source file:cascading.flow.tez.Hadoop2TezFlowStep.java
License:Open Source License
private static void setWorkingDirectory(Configuration conf) {
    String name = conf.get(JobContext.WORKING_DIR);

    if (name != null)
        return;

    try {
        Path dir = FileSystem.get(conf).getWorkingDirectory();
        conf.set(JobContext.WORKING_DIR, dir.toString());
    } catch (IOException exception) {
        throw new RuntimeException(exception);
    }
}
From source file:cascading.flow.tez.planner.Hadoop2TezFlowStepJob.java
License:Open Source License
private Path prepareEnsureStagingDir(TezConfiguration workingConf) throws IOException {
    String stepStagingPath = createStepStagingPath();

    workingConf.set(TezConfiguration.TEZ_AM_STAGING_DIR, stepStagingPath);

    Path stagingDir = new Path(stepStagingPath);
    FileSystem fileSystem = FileSystem.get(workingConf);

    stagingDir = fileSystem.makeQualified(stagingDir);

    TokenCache.obtainTokensForNamenodes(new Credentials(), new Path[] { stagingDir }, workingConf);

    TezClientUtils.ensureStagingDirExists(workingConf, stagingDir);

    if (fileSystem.getScheme().startsWith("file:/"))
        new File(stagingDir.toUri()).mkdirs();

    return stagingDir;
}
From source file:cascading.hive.HivePartitionDemo.java
License:Open Source License
public static void main(String[] args) throws Exception {
    Properties properties = new Properties();
    AppProps.setApplicationName(properties, "cascading hive partitioning demo");

    JobConf jobConf = new JobConf();
    FileSystem fs = FileSystem.get(jobConf);
    fs.copyFromLocalFile(false, true, new Path(accessLog), new Path("/tmp/access.log"));

    String[] columnNames = new String[] { "ts", "customer", "bucket", "operation", "key", "region" };
    String[] columnTypes = new String[] { "timestamp", "string", "string", "string", "string", "string" };
    String[] partitionKeys = new String[] { "region" };

    HiveTableDescriptor partitionedDescriptor = new HiveTableDescriptor("mydb", "mytable", columnNames,
            columnTypes, partitionKeys, "\t");

    HiveTap hiveTap = new HiveTap(partitionedDescriptor, partitionedDescriptor.toScheme());
    Tap partitionTap = new HivePartitionTap(hiveTap);

    Fields allFields = new Fields(columnNames);
    Tap input = new Hfs(new TextDelimited(allFields), "hdfs:/tmp/access.log");

    class Echo extends BaseOperation implements Function {
        public Echo(Fields fieldDeclaration) {
            super(2, fieldDeclaration);
        }

        @Override
        public void operate(FlowProcess flowProcess, FunctionCall functionCall) {
            TupleEntry argument = functionCall.getArguments();
            functionCall.getOutputCollector().add(argument.getTuple());
        }
    }

    Pipe pipe = new Each(" import ", allFields, new Echo(allFields), Fields.RESULTS);

    Flow flow = new HadoopFlowConnector().connect(input, partitionTap, pipe);
    flow.complete();

    Class.forName("org.apache.hadoop.hive.jdbc.HiveDriver");
    Connection con = DriverManager.getConnection("jdbc:hive://", "", "");
    Statement stmt = con.createStatement();
    ResultSet rs = stmt.executeQuery("select * from mydb.mytable where region = 'ASIA' ");
    String[] names = partitionedDescriptor.getColumnNames();

    System.out.println("----------------------Hive JDBC--------------------------");
    while (rs.next()) {
        StringBuffer buf = new StringBuffer("JDBC>>> ");
        for (int i = 0; i < names.length; i++) {
            String name = names[i];
            buf.append(name).append("=").append(rs.getObject(i + 1)).append(", ");
        }
        System.out.println(buf.toString());
    }
    System.out.println("---------------------------------------------------------");
    stmt.close();
    con.close();

    // do the same as the JDBC above, but in Cascading.
    class RegionFilter extends BaseOperation implements Filter {
        final String region;

        public RegionFilter(String region) {
            this.region = region;
        }

        @Override
        public boolean isRemove(FlowProcess flowProcess, FilterCall filterCall) {
            if (filterCall.getArguments().getString("region").equals(this.region))
                return false;
            return true;
        }
    }

    Tap requestsInAsiaSink = new Hfs(new TextDelimited(allFields), "hdfs:/tmp/requests-from-asia",
            SinkMode.REPLACE);
    Pipe headPipe = new Each("requests from ASIA", allFields, new RegionFilter("ASIA"));

    Flow headFlow = new HadoopFlowConnector().connect(partitionTap, requestsInAsiaSink, headPipe);
    headFlow.complete();

    TupleEntryIterator tupleEntryIterator = requestsInAsiaSink.openForRead(headFlow.getFlowProcess());
    while (tupleEntryIterator.hasNext()) {
        TupleEntry tupleEntry = tupleEntryIterator.next();
        System.out.println("Cascading>>> " + tupleEntry);
    }
    tupleEntryIterator.close();
}
From source file:cascading.hive.HiveViewDemo.java
License:Open Source License
public static void main(String[] args) throws Exception {
    Properties properties = new Properties();
    AppProps.setApplicationName(properties, "cascading hive partitioning demo");

    JobConf jobConf = new JobConf();
    FileSystem fs = FileSystem.get(jobConf);
    fs.copyFromLocalFile(false, true, new Path(accessLog), new Path("/tmp/access.log"));

    String[] columnNames = new String[] { "ts", "customer", "bucket", "operation", "key", "region" };
    String[] columnTypes = new String[] { "timestamp", "string", "string", "string", "string", "string" };
    String[] partitionKeys = new String[] { "region" };

    HiveTableDescriptor partitionedDescriptor = new HiveTableDescriptor("mydb2", "mytable2", columnNames,
            columnTypes, partitionKeys, "\t");

    HiveTap hiveTap = new HiveTap(partitionedDescriptor, partitionedDescriptor.toScheme());
    Tap outputTap = new HivePartitionTap(hiveTap);

    class Echo extends BaseOperation implements Function {
        public Echo(Fields fieldDeclaration) {
            super(2, fieldDeclaration);
        }

        @Override
        public void operate(FlowProcess flowProcess, FunctionCall functionCall) {
            TupleEntry argument = functionCall.getArguments();
            functionCall.getOutputCollector().add(argument.getTuple());
        }
    }

    Fields allFields = new Fields(columnNames);
    Pipe pipe = new Each(" echo ", allFields, new Echo(allFields), Fields.RESULTS);
    Tap input = new Hfs(new TextDelimited(allFields), "hdfs:/tmp/access.log");

    Flow flow = new HadoopFlowConnector().connect(input, outputTap, pipe);
    flow.complete();

    String viewSelect = "select distinct customer from mydb2.mytable2 where region = 'ASIA'";
    String viewDef = "create or replace view customers_in_asia as " + viewSelect;

    HiveViewAnalyzer analyzer = new HiveViewAnalyzer();
    Collection<Tap> inputs = analyzer.asTaps(viewSelect);

    HiveFlow viewflow = new HiveFlow("create view", viewDef, inputs);
    viewflow.complete();

    Class.forName("org.apache.hadoop.hive.jdbc.HiveDriver");
    Connection con = DriverManager.getConnection("jdbc:hive://", "", "");
    Statement stmt = con.createStatement();
    ResultSet rs = stmt.executeQuery("select * from customers_in_asia ");

    System.out.println("----------------------Hive JDBC--------------------------");
    while (rs.next())
        System.out.println("customer=" + rs.getString(1));
    System.out.println("---------------------------------------------------------");

    stmt.close();
    con.close();
}
From source file:cascading.platform.hadoop.HadoopPlatform.java
License:Open Source License
@Override
public synchronized void setUp() throws IOException {
    if (configuration != null)
        return;

    if (!isUseCluster()) {
        LOG.info("not using cluster");
        configuration = new JobConf();

        // enforce the local file system in local mode
        configuration.set("fs.default.name", "file:///");
        configuration.set("mapred.job.tracker", "local");
        configuration.set("mapreduce.jobtracker.staging.root.dir",
                System.getProperty("user.dir") + "/build/tmp/cascading/staging");

        String stagingDir = configuration.get("mapreduce.jobtracker.staging.root.dir");

        if (Util.isEmpty(stagingDir))
            configuration.set("mapreduce.jobtracker.staging.root.dir",
                    System.getProperty("user.dir") + "/build/tmp/cascading/staging");

        fileSys = FileSystem.get(configuration);
    } else {
        LOG.info("using cluster");

        if (Util.isEmpty(System.getProperty("hadoop.log.dir")))
            System.setProperty("hadoop.log.dir", "cascading-hadoop/build/test/log");

        if (Util.isEmpty(System.getProperty("hadoop.tmp.dir")))
            System.setProperty("hadoop.tmp.dir", "cascading-hadoop/build/test/tmp");

        new File(System.getProperty("hadoop.log.dir")).mkdirs(); // ignored

        JobConf conf = new JobConf();

        if (!Util.isEmpty(System.getProperty("mapred.jar"))) {
            LOG.info("using a remote cluster with jar: {}", System.getProperty("mapred.jar"));
            configuration = conf;

            ((JobConf) configuration).setJar(System.getProperty("mapred.jar"));

            if (!Util.isEmpty(System.getProperty("fs.default.name"))) {
                LOG.info("using {}={}", "fs.default.name", System.getProperty("fs.default.name"));
                configuration.set("fs.default.name", System.getProperty("fs.default.name"));
            }

            if (!Util.isEmpty(System.getProperty("mapred.job.tracker"))) {
                LOG.info("using {}={}", "mapred.job.tracker", System.getProperty("mapred.job.tracker"));
                configuration.set("mapred.job.tracker", System.getProperty("mapred.job.tracker"));
            }

            configuration.set("mapreduce.user.classpath.first", "true"); // use test dependencies
            fileSys = FileSystem.get(configuration);
        } else {
            dfs = new MiniDFSCluster(conf, 4, true, null);
            fileSys = dfs.getFileSystem();
            mr = new MiniMRCluster(4, fileSys.getUri().toString(), 1, null, null, conf);

            configuration = mr.createJobConf();
        }

        // jobConf.set( "mapred.map.max.attempts", "1" );
        // jobConf.set( "mapred.reduce.max.attempts", "1" );
        configuration.set("mapred.child.java.opts", "-Xmx512m");
        configuration.setInt("mapred.job.reuse.jvm.num.tasks", -1);
        configuration.setInt("jobclient.completion.poll.interval", 50);
        configuration.setInt("jobclient.progress.monitor.poll.interval", 50);
        ((JobConf) configuration).setMapSpeculativeExecution(false);
        ((JobConf) configuration).setReduceSpeculativeExecution(false);
    }

    ((JobConf) configuration).setNumMapTasks(numMappers);
    ((JobConf) configuration).setNumReduceTasks(numReducers);

    Map<Object, Object> globalProperties = getGlobalProperties();

    if (logger != null)
        globalProperties.put("log4j.logger", logger);

    FlowProps.setJobPollingInterval(globalProperties, 10); // should speed up tests

    HadoopPlanner.copyProperties((JobConf) configuration, globalProperties); // copy any external properties

    HadoopPlanner.copyJobConf(properties, (JobConf) configuration); // put all properties on the jobconf
}