List of usage examples for org.apache.hadoop.fs FileSystem open
public FSDataInputStream open(PathHandle fd) throws IOException
From source file:Vectors.java
License:Apache License
public static OpenIntIntHashMap readAsIntMap(Path path, Configuration conf) throws IOException { FileSystem fs = FileSystem.get(path.toUri(), conf); FSDataInputStream in = fs.open(path); try {/*from w ww .j a va2 s. com*/ return readAsIntMap(in); } finally { Closeables.closeQuietly(in); } }
From source file:Vectors.java
License:Apache License
public static Vector read(Path path, Configuration conf) throws IOException { FileSystem fs = FileSystem.get(path.toUri(), conf); FSDataInputStream in = fs.open(path); try {/* www. j a v a 2 s .com*/ return VectorWritable.readVector(in); } finally { Closeables.closeQuietly(in); } }
From source file:BwaInterpreter.java
License:Open Source License
private void combineOutputSamFiles(String outputHdfsDir, List<String> returnedValues) { try {// ww w .ja v a 2 s . co m Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(conf); Path finalHdfsOutputFile = new Path(outputHdfsDir + "/FullOutput.sam"); FSDataOutputStream outputFinalStream = fs.create(finalHdfsOutputFile, true); // We iterate over the resulting files in HDFS and agregate them into only one file. for (int i = 0; i < returnedValues.size(); i++) { LOG.info("JMAbuin:: SparkBWA :: Returned file ::" + returnedValues.get(i)); BufferedReader br = new BufferedReader( new InputStreamReader(fs.open(new Path(returnedValues.get(i))))); String line; line = br.readLine(); while (line != null) { if (i == 0 || !line.startsWith("@")) { //outputFinalStream.writeBytes(line+"\n"); outputFinalStream.write((line + "\n").getBytes()); } line = br.readLine(); } br.close(); fs.delete(new Path(returnedValues.get(i)), true); } outputFinalStream.close(); fs.close(); } catch (IOException e) { e.printStackTrace(); LOG.error(e.toString()); } }
From source file:BwaInterpreter.java
License:Open Source License
/** * Used to perform the sort operation in HDFS * @brief This function provides a method to perform the sort phase in HDFS * @author Jos M. Abun//from ww w.ja va 2 s .c o m * @param fileName1 The first file that contains input FASTQ reads. Stored in HDFS * @param fileName2 The second file that contains input FASTQ reads. Stored in HDFS * @return A JavaRDD that contains the paired reads sorted */ public JavaRDD<Tuple2<String, String>> SortInHDFS2(String fileName1, String fileName2) { Configuration conf = this.conf; LOG.info("JMAbuin:: Starting writing reads to HDFS"); try { FileSystem fs = FileSystem.get(conf); Path outputFilePath = new Path(this.inputTmpFileName); //To write the paired reads FSDataOutputStream outputFinalStream = fs.create(outputFilePath, true); //To read paired reads from both files BufferedReader brFastqFile1 = new BufferedReader(new InputStreamReader(fs.open(new Path(fileName1)))); BufferedReader brFastqFile2 = new BufferedReader(new InputStreamReader(fs.open(new Path(fileName2)))); String lineFastq1; String lineFastq2; lineFastq1 = brFastqFile1.readLine(); lineFastq2 = brFastqFile2.readLine(); //Loop to read two files. The two of them must have the same line numbers while (lineFastq1 != null) { //The lines are written interspersed outputFinalStream.write((lineFastq1 + "\n" + lineFastq2 + "\n").getBytes()); //Next lines are readed lineFastq1 = brFastqFile1.readLine(); lineFastq2 = brFastqFile2.readLine(); } //Close the input and output files brFastqFile1.close(); brFastqFile2.close(); outputFinalStream.close(); //Now it is time to read the previous created file and create the RDD ContentSummary cSummary = fs.getContentSummary(outputFilePath); long length = cSummary.getLength(); this.totalInputLength = length; fs.close(); //In case of the user does want partitioning if (this.options.getPartitionNumber() != 0) { //These options are set to indicate the split size and get the correct vnumber of partitions this.conf.set("mapreduce.input.fileinputformat.split.maxsize", String.valueOf((length) / this.options.getPartitionNumber())); this.conf.set("mapreduce.input.fileinputformat.split.minsize", String.valueOf((length) / this.options.getPartitionNumber())); LOG.info("JMAbuin partitioning from HDFS:: " + String.valueOf((length) / this.options.getPartitionNumber())); //Using the FastqInputFormatDouble class we get values from the HDFS file. After that, these values are stored in a RDD return this.ctx.newAPIHadoopFile(this.inputTmpFileName, FastqInputFormatDouble.class, Long.class, String.class, this.conf).mapPartitions(new BigFastq2RDDPartitionsDouble(), true); } else { //Using the FastqInputFormatDouble class we get values from the HDFS file. After that, these values are stored in a RDD return this.ctx.newAPIHadoopFile(this.inputTmpFileName, FastqInputFormatDouble.class, Long.class, String.class, this.conf).map(new BigFastq2RDDDouble()); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); LOG.error(e.toString()); return null; } }
From source file:ParascaleFsTestCase.java
License:Apache License
public static String getMD5Checksum(final Path aPath, final FileSystem aFileSystem) throws Exception { final byte[] b = md5sum(aFileSystem.open(aPath)); return byteToHex(b); }
From source file:JavaCustomReceiver.java
License:Apache License
/** Create a socket connection and receive data until receiver is stopped */ private void receive() { Socket socket = null;/*from w ww . j av a 2 s .co m*/ String userInput = null; try { // connect to the server socket = new Socket(host, port); // BufferedReader reader = new BufferedReader(new InputStreamReader(socket.getInputStream())); // Path pt=new Path("hdfs://192.168.0.1:9000/equinox-sanjose.20120119-netflow.txt"); // FileSystem fs = FileSystem.get(new Configuration()); // BufferedReader in=new BufferedReader(new InputStreamReader(fs.open(pt))); Path pt = new Path("hdfs://192.168.0.1:9000/user/hduser/equinox-sanjose.20120119-netflow.txt"); Configuration conf = new Configuration(); conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml")); conf.addResource(new Path("/usr/local/hadoop/conf/hdfs-site.xml")); // FileSystem fs = FileSystem.get(conf); FileSystem fs = pt.getFileSystem(conf); System.out.println(fs.getHomeDirectory()); BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(pt))); // BufferedReader in = new BufferedReader( // new FileReader( // "/home/hduser/spark_scratchPad/equinox-sanjose.20120119-netflow.txt")); // // Until stopped or connection broken continue reading while (!isStopped() && (userInput = in.readLine()) != null) { System.out.println("Received data '" + userInput + "'"); store(userInput); } in.close(); socket.close(); // Restart in an attempt to connect again when server is active again restart("Trying to connect again"); } catch (ConnectException ce) { // restart if could not connect to server restart("Could not connect", ce); } catch (Throwable t) { restart("Error receiving data", t); } }
From source file:ClassifierHD.java
License:Apache License
public static void main(String[] args) throws Exception { if (args.length < 5) { System.out.println(//from ww w . j a v a 2s . co m "Arguments: [model] [label index] [dictionnary] [document frequency] [postgres table] [hdfs dir] [job_id]"); return; } String modelPath = args[0]; String labelIndexPath = args[1]; String dictionaryPath = args[2]; String documentFrequencyPath = args[3]; String tablename = args[4]; String inputDir = args[5]; Configuration configuration = new Configuration(); // model is a matrix (wordId, labelId) => probability score NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration); StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model); // labels is a map label => classId Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath)); Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath)); Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath)); // analyzer used to extract word from tweet Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); int labelCount = labels.size(); int documentCount = documentFrequency.get(-1).intValue(); System.out.println("Number of labels: " + labelCount); System.out.println("Number of documents in training set: " + documentCount); Connection conn = null; PreparedStatement pstmt = null; try { Class.forName("org.postgresql.Driver"); conn = DriverManager.getConnection("jdbc:postgresql://192.168.50.170:5432/uzeni", "postgres", "dbwpsdkdl"); conn.setAutoCommit(false); String sql = "INSERT INTO " + tablename + " (id,gtime,wtime,target,num,link,body,rep) VALUES (?,?,?,?,?,?,?,?);"; pstmt = conn.prepareStatement(sql); FileSystem fs = FileSystem.get(configuration); FileStatus[] status = fs.listStatus(new Path(inputDir)); BufferedWriter bw = new BufferedWriter( new OutputStreamWriter(fs.create(new Path(inputDir + "/rep.list"), true))); for (int i = 0; i < status.length; i++) { BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(status[i].getPath()))); if (new String(status[i].getPath().getName()).equals("rep.list")) { continue; } int lv_HEAD = 1; int lv_cnt = 0; String lv_gtime = null; String lv_wtime = null; String lv_target = null; BigDecimal lv_num = null; String lv_link = null; String[] lv_args; String lv_line; StringBuilder lv_txt = new StringBuilder(); while ((lv_line = br.readLine()) != null) { if (lv_cnt < lv_HEAD) { lv_args = lv_line.split(","); lv_gtime = lv_args[0]; lv_wtime = lv_args[1]; lv_target = lv_args[2]; lv_num = new BigDecimal(lv_args[3]); lv_link = lv_args[4]; } else { lv_txt.append(lv_line + '\n'); } lv_cnt++; } br.close(); String id = status[i].getPath().getName(); String message = lv_txt.toString(); Multiset<String> words = ConcurrentHashMultiset.create(); TokenStream ts = analyzer.tokenStream("text", new StringReader(message)); CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); ts.reset(); int wordCount = 0; while (ts.incrementToken()) { if (termAtt.length() > 0) { String word = ts.getAttribute(CharTermAttribute.class).toString(); Integer wordId = dictionary.get(word); if (wordId != null) { words.add(word); wordCount++; } } } ts.end(); ts.close(); Vector vector = new RandomAccessSparseVector(10000); TFIDF tfidf = new TFIDF(); for (Multiset.Entry<String> entry : words.entrySet()) { String word = entry.getElement(); int count = entry.getCount(); Integer wordId = dictionary.get(word); Long freq = documentFrequency.get(wordId); double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount); vector.setQuick(wordId, tfIdfValue); } Vector resultVector = classifier.classifyFull(vector); double bestScore = -Double.MAX_VALUE; int bestCategoryId = -1; for (Element element : resultVector.all()) { int categoryId = element.index(); double score = element.get(); if (score > bestScore) { bestScore = score; bestCategoryId = categoryId; } } //System.out.println(message); //System.out.println(" => "+ lv_gtime + lv_wtime + lv_link + id + ":" + labels.get(bestCategoryId)); pstmt.setString(1, id); pstmt.setString(2, lv_gtime); pstmt.setString(3, lv_wtime); pstmt.setString(4, lv_target); pstmt.setBigDecimal(5, lv_num); pstmt.setString(6, lv_link); pstmt.setString(7, message.substring(1, Math.min(50, message.length()))); pstmt.setString(8, labels.get(bestCategoryId)); pstmt.addBatch(); bw.write(id + "\t" + labels.get(bestCategoryId) + "\n"); } pstmt.executeBatch(); //pstmt.clearParameters(); pstmt.close(); conn.commit(); conn.close(); bw.close(); } catch (Exception e) { System.err.println(e.getClass().getName() + ": " + e.getMessage()); System.exit(0); } analyzer.close(); }
From source file:ZipFileRecordReader.java
License:Apache License
/** * Initialise and open the ZIP file from the FileSystem *///from www. j a v a 2s.c o m @Override public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException { FileSplit split = (FileSplit) inputSplit; Configuration conf = taskAttemptContext.getConfiguration(); Path path = split.getPath(); FileSystem fs = path.getFileSystem(conf); // Open the stream fsin = fs.open(path); zip = new ZipInputStream(fsin); }
From source file:ExtractTopPersonalizedPageRankNodes.java
License:Apache License
/** * Runs this tool.// w ww. ja v a2s . c om */ @SuppressWarnings({ "static-access" }) public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("top n").create(TOP)); options.addOption(OptionBuilder.withArgName("src").hasArg().withDescription("source node").create(SRC)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(TOP)) { System.out.println("args: " + Arrays.toString(args)); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(120); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String inputPath = cmdline.getOptionValue(INPUT); String outputPath = "abc";//cmdline.getOptionValue(OUTPUT); int n = Integer.parseInt(cmdline.getOptionValue(TOP)); //LOG.info("Tool name: " + ExtractTopPersonalizedPageRankNodes.class.getSimpleName()); //LOG.info(" - input: " + inputPath); //LOG.info(" - output: " + outputPath); //LOG.info(" - top: " + n); Configuration conf = getConf(); conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024); conf.setInt(TOP_PG, n); Job job = Job.getInstance(conf); job.setJobName(ExtractTopPersonalizedPageRankNodes.class.getName() + ":" + inputPath); job.setJarByClass(ExtractTopPersonalizedPageRankNodes.class); job.setNumReduceTasks(1); FileInputFormat.addInputPath(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapOutputKeyClass(PairOfIntFloat.class); job.setMapOutputValueClass(IntWritable.class); job.setOutputKeyClass(FloatWritable.class); job.setOutputValueClass(IntWritable.class); job.setMapperClass(MyMapper.class); job.setPartitionerClass(MyPartitioner.class); job.setReducerClass(MyReducer.class); // Delete the output directory if it exists already. FileSystem.get(conf).delete(new Path(outputPath), true); job.waitForCompletion(true); FileSystem fileSystem = FileSystem.get(conf); Path path = new Path(outputPath + "/part-r-00000"); ; //MapFile.Reader reader = new MapFile.Reader(new Path(outputPath+ "/part-r-00000"),conf); // InputStream fis=new FileInputStream(outputPath+"/part-r-00000"); BufferedReader br = new BufferedReader(new InputStreamReader(fileSystem.open(path))); String s; float key;//=new FloatWritable(); int value;//=new IntWritable(); while ((s = br.readLine()) != null) { String[] sources = s.split("\\s+"); key = Float.parseFloat(sources[0]); value = Integer.parseInt(sources[1]); if (key == 0.0f) { System.out.print("\n" + "Source: " + value + "\n"); } else { System.out.print(String.format("%.5f %d", key, value) + "\n"); } } //reader.close(); br.close(); //while(!SysOut.isEmpty()) //{ // System.out.print(SysOut.poll()); //} return 0; }
From source file:SeekableInputStream.java
License:Apache License
public static SeekableInputStream getInstance(Path path, long start, long end, FileSystem fs, CompressionCodecFactory compressionCodecs) throws IOException { FSDataInputStream din = fs.open(path); din.seek(start);/*from ww w . ja va 2 s. com*/ return new SeekableInputStream(din); }