List of usage examples for org.apache.hadoop.fs FileSystem open
public FSDataInputStream open(Path f) throws IOException
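Before the collected examples, here is a minimal, self-contained sketch of the open-and-read pattern the snippets below share. The cluster URI, file path, and buffer size are illustrative assumptions, not values taken from any of the source files.

import java.io.InputStream;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class OpenExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical cluster URI and file path, used only for illustration.
        FileSystem fs = FileSystem.get(URI.create("hdfs://localhost:9000/"), conf);
        InputStream in = null;
        try {
            // open() returns an FSDataInputStream positioned at the start of the file.
            in = fs.open(new Path("/user/example/input.txt"));
            // Copy the stream to stdout with a 4 KB buffer; 'false' leaves the stream open.
            IOUtils.copyBytes(in, System.out, 4096, false);
        } finally {
            IOUtils.closeStream(in);
        }
    }
}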
From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.FPGrowthDriver.java
License:Apache License
private static void runFPGrowth(Parameters params) throws IOException {
    log.info("Starting Sequential FPGrowth");
    int maxHeapSize = Integer.valueOf(params.get("maxHeapSize", "50"));
    int minSupport = Integer.valueOf(params.get("minSupport", "3"));
    Path output = new Path(params.get("output", "output.txt"));
    Path input = new Path(params.get("input"));
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(output.toUri(), conf);
    Charset encoding = Charset.forName(params.get("encoding"));
    String pattern = params.get("splitPattern", PFPGrowth.SPLITTER.toString());

    SequenceFile.Writer writer =
            new SequenceFile.Writer(fs, conf, output, Text.class, TopKStringPatterns.class);

    FSDataInputStream inputStream = null;
    FSDataInputStream inputStreamAgain = null;
    Collection<String> features = Sets.newHashSet();

    if ("true".equals(params.get(PFPGrowth.USE_FPG2))) {
        com.cg.mapreduce.fpgrowth.mahout.fpm.fpgrowth2.FPGrowthObj<String> fp =
                new com.cg.mapreduce.fpgrowth.mahout.fpm.fpgrowth2.FPGrowthObj<String>();
        try {
            inputStream = fs.open(input);
            inputStreamAgain = fs.open(input);
            fp.generateTopKFrequentPatterns(
                    new StringRecordIterator(new FileLineIterable(inputStream, encoding, false), pattern),
                    fp.generateFList(
                            new StringRecordIterator(new FileLineIterable(inputStreamAgain, encoding, false), pattern),
                            minSupport),
                    minSupport, maxHeapSize, features,
                    new StringOutputConverter(
                            new SequenceFileOutputCollector<Text, TopKStringPatterns>(writer)),
                    new ContextStatusUpdater(null));
        } finally {
            Closeables.close(writer, false);
            Closeables.close(inputStream, true);
            Closeables.close(inputStreamAgain, true);
        }
    } else {
        FPGrowth<String> fp = new FPGrowth<String>();
        inputStream = fs.open(input);
        inputStreamAgain = fs.open(input);
        try {
            fp.generateTopKFrequentPatterns(
                    new StringRecordIterator(new FileLineIterable(inputStream, encoding, false), pattern),
                    fp.generateFList(
                            new StringRecordIterator(new FileLineIterable(inputStreamAgain, encoding, false), pattern),
                            minSupport),
                    minSupport, maxHeapSize, features,
                    new StringOutputConverter(
                            new SequenceFileOutputCollector<Text, TopKStringPatterns>(writer)),
                    new ContextStatusUpdater(null));
        } finally {
            Closeables.close(writer, false);
            Closeables.close(inputStream, true);
            Closeables.close(inputStreamAgain, true);
        }
    }

    List<Pair<String, TopKStringPatterns>> frequentPatterns = FPGrowth.readFrequentPattern(conf, output);
    for (Pair<String, TopKStringPatterns> entry : frequentPatterns) {
        log.info("Dumping Patterns for Feature: {} \n{}", entry.getFirst(), entry.getSecond());
    }
}
From source file:com.chinamobile.bcbsp.bspstaff.BSPStaff.java
License:Apache License
private void readMigratePartition(StaffSSControllerInterface sssc, int currentSuperStepCounter)
        throws IOException {
    BufferedReader br = null;
    Path migratePartitionPath = new Path(migratePartitionDir);
    FileSystem fsFileSystem = FileSystem.get(this.getConf().getConf());
    FileStatus[] fs = fsFileSystem.listStatus(migratePartitionPath);
    Path[] listPath = FileUtil.stat2Paths(fs);
    for (Path p : listPath) {
        FSDataInputStream fsInput = fsFileSystem.open(p);
        br = new BufferedReader(new InputStreamReader(fsInput));
        String line = null;
        while (null != (line = br.readLine())) {
            String[] strs = line.split(":");
            this.partitioner.updateMigratePartition(new Text(strs[0]), Integer.parseInt(strs[1]));
        }
    }
}
From source file:com.chinnu.churndetection.fuzzykmeans.FuzzyKMeansReducer.java
@Override
protected void reduce(IntWritable key, Iterable<Vector> values,
        Reducer<IntWritable, Vector, IntWritable, Text>.Context context)
        throws IOException, InterruptedException {

    double[] sum = new double[DATALENGTH];
    for (int i = 0; i < DATALENGTH; i++) {
        sum[i] = 0;
    }

    int count = 0;
    for (Vector vector : values) {
        for (int i = 0; i < DATALENGTH; i++) {
            sum[i] += vector.getData()[i];
        }
        count++;
        Text text = new Text(vector.toString());
        context.write(key, text);
    }

    double[] newCenter = new double[DATALENGTH];
    for (int i = 0; i < DATALENGTH; i++) {
        newCenter[i] = sum[i] / count;
    }

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    List<double[]> curr_center = new ArrayList<>();
    String[] lineSplit = CURR_CENTER.split("\n");
    for (int j = 0; j < lineSplit.length; j++) {
        String line = lineSplit[j];
        String[] split = line.split(",");
        double[] temp = new double[split.length];
        for (int i = 0; i < split.length; i++) {
            temp[i] = Double.parseDouble(split[i]);
        }
        curr_center.add(temp);
    }

    List<String> appendLine = new ArrayList<>();
    if (fs.exists(new Path(NEW_CENTER))) {
        BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(NEW_CENTER))));
        String line;
        while ((line = br.readLine()) != null) {
            appendLine.add(line);
        }
    }

    PrintWriter pw = new PrintWriter(new OutputStreamWriter(fs.create(new Path(NEW_CENTER), true)));
    for (String string : appendLine) {
        pw.println(string);
        pw.flush();
    }

    String line = "";
    for (int i = 0; i < DATALENGTH; i++) {
        line += newCenter[i] + ",";
    }
    String substring = line.substring(0, line.length() - 1);
    pw.println(substring);
    pw.flush();
    pw.close();

    MRLogger.Log(context.getJobName());
    MRLogger.Log(Arrays.toString(curr_center.get(key.get())));
    MRLogger.Log(Arrays.toString(newCenter));

    double curr_Distance = DistanceComparator.findDistance(curr_center.get(key.get()), newCenter);
    MRLogger.Log(curr_Distance + "");

    if (curr_Distance < 0.01) {
        PrintWriter pw1 = new PrintWriter(
                new OutputStreamWriter(fs.create(new Path(ChurnDriver.CENTER_CONVERGED), true)));
        pw1.println("converged");
        pw1.flush();
        pw1.close();
    }
}
From source file:com.cip.crane.agent.utils.TaskHelper.java
License:Open Source License
@SuppressWarnings("unused") private void readFileFromHdfs(String srcFile, String destFile) throws IOException, FileNotFoundException { File file = new File(destFile); if (file.exists()) { file.delete();/*from w ww .j ava 2s . c o m*/ } byte[] buf = new byte[BUFFER_SIZE]; FileOutputStream fos = new FileOutputStream(file); FileSystem fs; FSDataInputStream hdfsInput; try { fs = FileSystem.get(URI.create(srcFile), conf); hdfsInput = fs.open(new Path(srcFile)); int num = hdfsInput.read(buf); while (num != (-1)) {// ? fos.write(buf, 0, num);// ? fos.flush();// ? num = hdfsInput.read(buf);// ?? } hdfsInput.close(); fos.close(); fs.close(); } catch (IOException e) { if (file.exists()) { file.delete(); } throw e; } }
From source file:com.ckelsel.hadoop.dfs.Test.Test.java
License:Open Source License
public static void main(String[] args) throws Exception {
    String uri = "hdfs://localhost:9000/";
    Configuration config = new Configuration();
    FileSystem fs = FileSystem.get(URI.create(uri), config);

    // list hdfs /user/ckelsel/
    FileStatus[] statuses = fs.listStatus(new Path("/user/ckelsel"));
    for (FileStatus status : statuses) {
        System.out.println(status);
    }

    // create hdfs /user/ckelsel/test.log
    FSDataOutputStream os = fs.create(new Path("/user/ckelsel/test.log"));
    os.write("Hello World!".getBytes());
    os.flush();
    os.close();

    // read hdfs /user/ckelsel/test.log
    InputStream is = fs.open(new Path("/user/ckelsel/test.log"));
    IOUtils.copyBytes(is, System.out, 1024, true);
}
From source file:com.cloudera.bigdata.analysis.dataload.mapreduce.SplitableRecordReader.java
License:Apache License
/**
 * Decide the start of the reader.
 */
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // if (codec instanceof CryptoCodec && job instanceof JobConf)
    //     CryptoContextHelper.resetInputCryptoContext((CryptoCodec) codec,
    //         (JobConf) job, file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());

    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(cIn, job);
            } else {
                in = new LineReader(cIn, job, this.recordDelimiterBytes);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(codec.createInputStream(fileIn), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn), job, this.recordDelimiterBytes);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }
        filePosition = fileIn;
    }
    LOG.info("Read from " + split.getPath().toString());

    // If this is not the first split, we always throw away the first record
    // because we always (except for the last split) read one extra line in
    // the next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));

        // Read another line as previous.
        Text current = new Text();
        int newSize = in.readLine(previous, maxLineLength, maxBytesToConsume(start));
        LOG.info("Skip line " + previous + " for last split.");
        start += newSize;

        // Keep reading until a splitable point is found.
        while (start <= end) {
            newSize = in.readLine(current, maxLineLength, maxBytesToConsume(start));
            if (canSplit(previous.getBytes(), current.getBytes())) {
                break;
            }
            start += newSize;
            previous.set(current.getBytes());
            LOG.info("Skip line " + previous + " for last split.");
        }

        // If we exceed the end, still read one extra line.
        if (start > end) {
            if (isContinue) {
                newSize = in.readLine(current, maxLineLength, maxBytesToConsume(start));
                if (!canSplit(previous.getBytes(), current.getBytes())) {
                    // Still not splitable, so skip the block.
                    start += newSize;
                    isContinue = false;
                }
            }
        }
        LOG.info("Split between: \n" + previous + "\n" + current);

        // Restart at the last read line.
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }
        this.pos = start;
    } else {
        Text skip = new Text();
        start += in.readLine(skip, maxLineLength, maxBytesToConsume(start));
        // start += in.readLine(skip, 0, maxBytesToConsume(start));
        LOG.info("Skip line " + skip + ". Start at " + start);
    }

    // Restart at the start index.
}
From source file:com.cloudera.ByteBufferRecordReader.java
License:Apache License
private void initialize(Configuration job, long splitStart, long splitLength, Path file) throws IOException {
    start = splitStart;
    end = start + splitLength;
    pos = start;

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    this.readStats = new ReadStatistics();
    this.bufferPool = new ElasticByteBufferPool();
    boolean skipChecksums = job.getBoolean("bytecount.skipChecksums", false);
    this.readOption = skipChecksums ? EnumSet.of(ReadOption.SKIP_CHECKSUMS) : EnumSet.noneOf(ReadOption.class);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
        isCompressedInput = true;
        decompressor = CodecPool.getDecompressor(codec);
        CompressionInputStream cIn = codec.createInputStream(fileIn, decompressor);
        filePosition = cIn;
        inputStream = cIn;
        LOG.info("Compressed input; cannot compute number of records in the split");
    } else {
        fileIn.seek(start);
        filePosition = fileIn;
        inputStream = fileIn;
        LOG.info("Split pos = " + start + " length " + splitLength);
    }
}
From source file:com.cloudera.cdk.morphline.hadoop.rcfile.ReadRCFileTest.java
License:Apache License
private InputStream readPath(final Path inputFile) throws IOException {
    FileSystem fs = inputFile.getFileSystem(new Configuration());
    return fs.open(inputFile);
}
From source file:com.cloudera.cdk.tools.JobClasspathHelper.java
License:Apache License
/**
 * @param conf
 *            Configuration object for the Job. Used to get the FileSystem associated with it.
 * @param libDir
 *            Destination directory in the FileSystem (usually HDFS) where to upload and look for the libs.
 * @param classesToInclude
 *            Classes that are needed by the job. JarFinder will look for the jar containing these classes.
 * @throws Exception
 */
public void prepareClasspath(final Configuration conf, final Path libDir, Class<?>... classesToInclude)
        throws Exception {
    FileSystem fs = null;
    List<Class<?>> classList = new ArrayList<Class<?>>(Arrays.asList(classesToInclude));
    fs = FileSystem.get(conf);
    Map<String, String> jarMd5Map = new TreeMap<String, String>();

    // For each class we use JarFinder to locate the jar in the local classpath.
    for (Class<?> clz : classList) {
        if (clz != null) {
            String localJarPath = JarFinder.getJar(clz);
            // we don't want to upload the same jar twice
            if (!jarMd5Map.containsKey(localJarPath)) {
                // We should not push core Hadoop classes with this tool.
                // Should it be the responsibility of the developer, or do we keep
                // this fence here?
                if (!clz.getName().startsWith("org.apache.hadoop.")) {
                    // we compute the MD5 sum of the local jar
                    InputStream in = new FileInputStream(localJarPath);
                    boolean threw = true;
                    try {
                        String md5sum = DigestUtils.md5Hex(in);
                        jarMd5Map.put(localJarPath, md5sum);
                        threw = false;
                    } finally {
                        Closeables.close(in, threw);
                    }
                } else {
                    logger.info("Ignoring {}, since it looks like it's from Hadoop's core libs", localJarPath);
                }
            }
        }
    }

    for (Entry<String, String> entry : jarMd5Map.entrySet()) {
        Path localJarPath = new Path(entry.getKey());
        String jarFilename = localJarPath.getName();
        String localMd5sum = entry.getValue();
        logger.info("Jar {}. MD5 : [{}]", localJarPath, localMd5sum);

        Path remoteJarPath = new Path(libDir, jarFilename);
        Path remoteMd5Path = new Path(libDir, jarFilename + ".md5");

        // If the jar file or its MD5 file does not exist in HDFS,
        // we force the upload of the jar.
        if (!fs.exists(remoteJarPath) || !fs.exists(remoteMd5Path)) {
            copyJarToHDFS(fs, localJarPath, localMd5sum, remoteJarPath, remoteMd5Path);
        } else {
            // If the jar exists, we validate the MD5 file.
            // If the MD5 sum is different, we upload the jar.
            FSDataInputStream md5FileStream = null;
            String remoteMd5sum = "";
            try {
                md5FileStream = fs.open(remoteMd5Path);
                byte[] md5bytes = new byte[32];
                if (32 == md5FileStream.read(md5bytes)) {
                    remoteMd5sum = new String(md5bytes, Charsets.UTF_8);
                }
            } finally {
                if (md5FileStream != null) {
                    md5FileStream.close();
                }
            }
            if (localMd5sum.equals(remoteMd5sum)) {
                logger.info("Jar {} already exists [{}] and md5sums are equal", jarFilename,
                        remoteJarPath.toUri().toASCIIString());
            } else {
                logger.info("Jar {} already exists [{}] and md5sums are different!", jarFilename,
                        remoteJarPath.toUri().toASCIIString());
                copyJarToHDFS(fs, localJarPath, localMd5sum, remoteJarPath, remoteMd5Path);
            }
        }
        // In all cases we want to add the jar to the DistributedCache's classpath.
        DistributedCache.addFileToClassPath(remoteJarPath, conf, fs);
    }
    // and we create the symlink (was necessary in earlier versions of Hadoop)
    DistributedCache.createSymlink(conf);
}
From source file:com.cloudera.circus.test.TestXTest.java
License:Open Source License
@Test
@TestHadoop
public void testHadoopFileSystem() throws Exception {
    JobConf conf = getHadoopConf();
    FileSystem fs = FileSystem.get(conf);
    try {
        OutputStream os = fs.create(new Path(getHadoopTestDir(), "foo"));
        os.write(new byte[] { 1 });
        os.close();
        InputStream is = fs.open(new Path(getHadoopTestDir(), "foo"));
        Assert.assertEquals(is.read(), 1);
        Assert.assertEquals(is.read(), -1);
        is.close();
    } finally {
        fs.close();
    }
}