List of usage examples for org.apache.hadoop.fs FileSystem open
public FSDataInputStream open(Path f) throws IOException
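Before the sourced examples, here is a minimal, self-contained sketch of the call. The path /tmp/example.txt and the default Configuration are hypothetical stand-ins, not taken from any of the sources below:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class OpenExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/example.txt"); // hypothetical input path
        // resolve the FileSystem from the path's scheme (HDFS, local, ...)
        FileSystem fs = path.getFileSystem(conf);
        // open() returns a seekable FSDataInputStream; close it when done
        try (FSDataInputStream in = fs.open(path);
                BufferedReader reader = new BufferedReader(new InputStreamReader(in))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}

The examples that follow show the same call in context: reading dictionaries, filter files, serialized models, and MapReduce outputs.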
From source file:be.ugent.intec.halvade.HalvadeOptions.java
License:Open Source License
protected void parseDictFile(Configuration conf) {
    be.ugent.intec.halvade.utils.Logger.DEBUG("parsing dictionary " + ref + DICT_SUFFIX);
    try {
        FileSystem fs = FileSystem.get(new URI(ref + DICT_SUFFIX), conf);
        FSDataInputStream stream = fs.open(new Path(ref + DICT_SUFFIX));
        String line = getLine(stream); // header
        dict = new SAMSequenceDictionary();
        line = getLine(stream);
        while (line != null) {
            String[] lineData = line.split("\\s+");
            String seqName = lineData[1].substring(lineData[1].indexOf(':') + 1);
            int seqLength = 0;
            try {
                seqLength = Integer.parseInt(lineData[2].substring(lineData[2].indexOf(':') + 1));
            } catch (NumberFormatException ex) {
                be.ugent.intec.halvade.utils.Logger.EXCEPTION(ex);
            }
            SAMSequenceRecord seq = new SAMSequenceRecord(seqName, seqLength);
            // Logger.DEBUG("name: " + seq.getSequenceName() + " length: " + seq.getSequenceLength());
            dict.addSequence(seq);
            line = getLine(stream);
        }
        HalvadeConf.setSequenceDictionary(conf, dict);
    } catch (URISyntaxException | IOException ex) {
        be.ugent.intec.halvade.utils.Logger.EXCEPTION(ex);
    }
}
From source file:be.ugent.intec.halvade.uploader.input.BaseFileReader.java
protected static BufferedReader getReader(boolean readFromDistributedStorage, String file)
        throws FileNotFoundException, IOException {
    InputStream hdfsIn;
    if (readFromDistributedStorage) {
        Path pt = new Path(file);
        FileSystem fs = FileSystem.get(pt.toUri(), new Configuration());
        hdfsIn = fs.open(pt);
        // read the stream in the correct format!
        if (file.endsWith(".gz")) {
            GZIPInputStream gzip = new GZIPInputStream(hdfsIn, BUFFERSIZE);
            return new BufferedReader(new InputStreamReader(gzip));
        } else if (file.endsWith(".bz2")) {
            CBZip2InputStream bzip2 = new CBZip2InputStream(hdfsIn);
            return new BufferedReader(new InputStreamReader(bzip2));
        } else
            return new BufferedReader(new InputStreamReader(hdfsIn));
    } else {
        if (file.endsWith(".gz")) {
            GZIPInputStream gzip = new GZIPInputStream(new FileInputStream(file), BUFFERSIZE);
            return new BufferedReader(new InputStreamReader(gzip));
        } else if (file.endsWith(".bz2")) {
            CBZip2InputStream bzip2 = new CBZip2InputStream(new FileInputStream(file));
            return new BufferedReader(new InputStreamReader(bzip2));
        } else if (file.equals("-")) {
            return new BufferedReader(new InputStreamReader(System.in));
        } else
            return new BufferedReader(new FileReader(file));
    }
}
From source file:be.ugent.intec.halvade.utils.ChromosomeSplitter.java
License:Open Source License
private void importSplitter(String filename, Configuration conf) throws URISyntaxException, IOException {
    regions = new ArrayList();
    DataInputStream dis = null;
    FileSystem hdfs = null;
    try {
        hdfs = FileSystem.get(new URI(filename), conf);
        Path file = new Path(filename);
        InputStream is = hdfs.open(file);
        dis = new DataInputStream(is);
        int len = dis.readInt();
        for (int i = 0; i < len; i++) {
            String contig = dis.readUTF();
            int start = dis.readInt();
            int end = dis.readInt();
            int key = dis.readInt();
            regions.add(new BedRegion(contig, start, end, key));
        }
    } finally {
        if (dis != null)
            dis.close();
    }
}
From source file:bigsatgps.BigDataHandler.java
License:Open Source License
/**
 * @param infile
 * @return the path of the created sequence file, or null on failure
 * @throws Exception
 */
public String ImageToSequence(String infile) throws Exception {
    String log4jConfPath = "lib/log4j.properties";
    PropertyConfigurator.configure(log4jConfPath);
    confHadoop = new Configuration();
    confHadoop.addResource(new Path("/hadoop/projects/hadoop-1.0.4/conf/core-site.xml"));
    confHadoop.addResource(new Path("/hadoop/projects/hadoop-1.0.4/conf/hdfs-site.xml"));
    FileSystem fs = FileSystem.get(confHadoop);
    Path inPath = new Path(infile);
    String outfile = infile.substring(0, infile.indexOf(".")) + ".seq";
    Path outPath = new Path(outfile);
    FSDataInputStream in = null;
    Text key = new Text();
    BytesWritable value = new BytesWritable();
    SequenceFile.Writer writer = null;
    try {
        in = fs.open(inPath);
        // size the buffer from the file status; in.available() is unreliable for remote streams
        byte[] buffer = new byte[(int) fs.getFileStatus(inPath).getLen()];
        // a single read() is not guaranteed to fill the buffer, so read fully
        IOUtils.readFully(in, buffer, 0, buffer.length);
        writer = SequenceFile.createWriter(fs, confHadoop, outPath, key.getClass(), value.getClass());
        writer.append(new Text(inPath.getName()), new BytesWritable(buffer));
        System.out.println("Successfully created the sequencefile " + outfile);
        return outfile;
    } catch (IOException e) {
        System.err.println("Exception MESSAGES = " + e.getMessage());
        return null;
    } finally {
        IOUtils.closeStream(writer);
        IOUtils.closeStream(in);
    }
}
From source file:bixo.examples.crawl.MultiDomainUrlFilter.java
License:Apache License
public MultiDomainUrlFilter(Path filterFile) throws Exception {
    // we could require a filter file and put these in all urls, or leave them here
    _suffixExclusionPattern = Pattern.compile("(?i)\\.(pdf|zip|gzip|gz|sit|bz|bz2|tar|tgz|exe)$");
    _protocolInclusionPattern = Pattern.compile("(?i)^(http|https)://");

    JobConf conf = HadoopUtils.getDefaultJobConf();
    try { // process the file passed in
        if (filterFile != null) {
            FileSystem fs = filterFile.getFileSystem(conf);
            if (fs.exists(filterFile)) {
                FSDataInputStream in = fs.open(filterFile);
                LineReader lr = new LineReader(in);
                Text tmpStr = new Text();
                while (lr.readLine(tmpStr) > 0 && !tmpStr.toString().equals("")) { // skip blank lines
                    String p = tmpStr.toString().trim(); // remove whitespace
                    if (p.substring(0, 1).equals("+")) { // '+' means do-crawl
                        ArrayList filterPair = new ArrayList();
                        filterPair.add((Boolean) true);
                        filterPair.add(Pattern.compile(p.substring(1, p.length())));
                        _filters.add(filterPair);
                    } else if (p.substring(0, 1).equals("-")) { // '-' means filter out
                        ArrayList filterPair = new ArrayList();
                        filterPair.add((Boolean) false);
                        filterPair.add(Pattern.compile(p.substring(1, p.length())));
                        _filters.add(filterPair);
                    } // otherwise a comment or malformed filter pattern
                }
            }
        }
    } catch (Exception e) {
        // any cleanup here? This would indicate a file system error, most likely
        throw e;
    }
}
From source file:bixo.examples.crawl.RegexUrlFilter.java
License:Apache License
public static List<String> getUrlFilterPatterns(String urlFiltersFile)
        throws IOException, InterruptedException {
    // this reads regex filters from a file in HDFS or the native file system
    JobConf conf = HadoopUtils.getDefaultJobConf();
    Path filterFile = new Path(urlFiltersFile);
    FileSystem fs = filterFile.getFileSystem(conf);
    List<String> filterList = new ArrayList<String>();
    LOGGER.info("Looking for file: " + urlFiltersFile);
    if (fs.exists(filterFile)) {
        FSDataInputStream in = fs.open(filterFile);
        LineReader reader = new LineReader(in);
        Text tLine = new Text();
        while (reader.readLine(tLine) > 0) {
            String line = tLine.toString();
            if (StringUtils.isNotBlank(line)
                    && (line.startsWith(INCLUDE_CHAR) || line.startsWith(EXCLUDE_CHAR))) {
                filterList.add(line.trim());
            }
        }
        in.close();
    } else {
        LOGGER.info("Can't find file: " + urlFiltersFile);
    }
    return filterList;
}
From source file:biz.hangyang.knnspark.spark.KNNClassifySpark.java
public static JavaPairRDD<Entity, Object> calKDistance(final String trainingDataPath, String testingDataPath,
        final int k, final Map<Object, Double> weightMap, JavaSparkContext sc, int partition,
        final Accumulator<Integer> accum) {
    JavaRDD<String> testingDataRDD = sc.textFile(testingDataPath, partition);
    // parse each input line into an Entity
    JavaRDD<Entity> testingEntityRDD = testingDataRDD.map(new Function<String, Entity>() {
        @Override
        public Entity call(String line) throws Exception {
            return new GeneEntity(line);
        }
    });
    // for each testing entity, compute the k nearest training distances as (Entity, KDistance) pairs
    JavaPairRDD<Entity, KDistance> ekRDD = testingEntityRDD
            .mapPartitionsToPair(new PairFlatMapFunction<Iterator<Entity>, Entity, KDistance>() {
                @Override
                public Iterable<Tuple2<Entity, KDistance>> call(Iterator<Entity> t) throws Exception {
                    // materialize all testing entities in this partition
                    List<Entity> entityList = new ArrayList<>();
                    while (t.hasNext()) {
                        entityList.add(t.next());
                    }
                    // one KDistance accumulator per testing entity
                    List<KDistance> kDistanceList = new ArrayList<>();
                    for (int i = 0; i < entityList.size(); i++) {
                        kDistanceList.add(new KDistance(k));
                    }
                    // stream the training data from HDFS and update every entity's nearest distances
                    Configuration conf = new Configuration();
                    FileSystem fs = FileSystem.get(URI.create(trainingDataPath), conf);
                    FSDataInputStream in = fs.open(new Path(trainingDataPath));
                    BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"));
                    String line;
                    while ((line = br.readLine()) != null) {
                        Entity lineEntity = new GeneEntity(line);
                        for (int i = 0; i < entityList.size(); i++) {
                            kDistanceList.get(i).add(new DemoDistanceCatagory(
                                    lineEntity.distance(entityList.get(i)), lineEntity.category));
                        }
                    }
                    br.close();
                    List<Tuple2<Entity, KDistance>> tList = new ArrayList<>();
                    for (int i = 0; i < entityList.size(); i++) {
                        tList.add(new Tuple2<>(entityList.get(i), kDistanceList.get(i)));
                    }
                    return tList;
                }
            });
    JavaPairRDD<Entity, Object> eoRDD = ekRDD
            .mapToPair(new PairFunction<Tuple2<Entity, KDistance>, Entity, Object>() {
                @Override
                public Tuple2<Entity, Object> call(Tuple2<Entity, KDistance> t) throws Exception {
                    KDistance kDistance = t._2();
                    // vote on a category from the k nearest neighbours and their weights
                    Object catagory = KDistance.getCatagory(kDistance.get(), weightMap);
                    if (t._1().category.equals(catagory)) {
                        accum.add(1);
                    }
                    return new Tuple2<>(t._1(), catagory);
                }
            });
    return eoRDD;
}
From source file:boa.functions.BoaIntrinsics.java
License:Apache License
/**
 * Given the model URL, deserialize the model and return Model type
 *
 * @param URL path for the model
 * @return Model type after deserializing
 */
// TODO Take complete URL and then deserialize the model
// FIXME Returning Object as a type, this needs to be changed once we defined Model Type
@FunctionSpec(name = "load", returnType = "Model", formalParameters = { "string" })
public static Object load(final String URL) throws Exception {
    Object deserializedObject = null;
    FSDataInputStream in = null;
    try {
        final Configuration conf = new Configuration();
        final FileSystem fileSystem = FileSystem.get(conf);
        final Path path = new Path("hdfs://boa-njt" + URL);
        in = fileSystem.open(path);
        // size the buffer from the file length and read fully;
        // a single read() is not guaranteed to fill the buffer
        final byte[] b = new byte[(int) fileSystem.getLength(path)];
        in.readFully(0, b);
        ByteArrayInputStream bin = new ByteArrayInputStream(b);
        ObjectInputStream dataIn = new ObjectInputStream(bin);
        deserializedObject = dataIn.readObject();
        dataIn.close();
    } catch (Exception ex) {
        ex.printStackTrace();
    } finally {
        if (in != null)
            try {
                in.close();
            } catch (final Exception e) {
                e.printStackTrace();
            }
    }
    return deserializedObject;
}
From source file:boa.io.BoaOutputCommitter.java
License:Apache License
private void storeOutput(final JobContext context, final int jobId) {
    if (jobId == 0)
        return;

    Connection con = null;
    FileSystem fileSystem = null;
    FSDataInputStream in = null;
    FSDataOutputStream out = null;

    try {
        fileSystem = outputPath.getFileSystem(context.getConfiguration());

        con = DriverManager.getConnection(url, user, password);
        PreparedStatement ps = null;
        try {
            ps = con.prepareStatement("INSERT INTO boa_output (id, length) VALUES (" + jobId + ", 0)");
            ps.executeUpdate();
        } catch (final Exception e) {
        } finally {
            try {
                if (ps != null)
                    ps.close();
            } catch (final Exception e) {
                e.printStackTrace();
            }
        }

        fileSystem.mkdirs(new Path("/boa", new Path("" + jobId)));
        out = fileSystem.create(new Path("/boa", new Path("" + jobId, new Path("output.txt"))));

        int partNum = 0;
        final byte[] b = new byte[64 * 1024 * 1024];
        long length = 0;
        boolean hasWebResult = false;

        while (true) {
            final Path path = new Path(outputPath, "part-r-" + String.format("%05d", partNum++));
            if (!fileSystem.exists(path))
                break;

            if (in != null)
                try {
                    in.close();
                } catch (final Exception e) {
                    e.printStackTrace();
                }
            in = fileSystem.open(path);

            int numBytes = 0;
            while ((numBytes = in.read(b)) > 0) {
                if (!hasWebResult) {
                    hasWebResult = true;
                    try {
                        ps = con.prepareStatement("UPDATE boa_output SET web_result=? WHERE id=" + jobId);
                        int webSize = 64 * 1024 - 1;
                        ps.setString(1, new String(b, 0, numBytes < webSize ? numBytes : webSize));
                        ps.executeUpdate();
                    } finally {
                        try {
                            if (ps != null)
                                ps.close();
                        } catch (final Exception e) {
                            e.printStackTrace();
                        }
                    }
                }
                out.write(b, 0, numBytes);
                length += numBytes;
                this.context.progress();
            }
        }

        try {
            ps = con.prepareStatement("UPDATE boa_output SET length=? WHERE id=" + jobId);
            ps.setLong(1, length);
            ps.executeUpdate();
        } finally {
            try {
                if (ps != null)
                    ps.close();
            } catch (final Exception e) {
                e.printStackTrace();
            }
        }
    } catch (final Exception e) {
        e.printStackTrace();
    } finally {
        try {
            if (con != null)
                con.close();
        } catch (final Exception e) {
            e.printStackTrace();
        }
        try {
            if (in != null)
                in.close();
        } catch (final Exception e) {
            e.printStackTrace();
        }
        try {
            if (out != null)
                out.close();
        } catch (final Exception e) {
            e.printStackTrace();
        }
        try {
            if (fileSystem != null)
                fileSystem.close();
        } catch (final Exception e) {
            e.printStackTrace();
        }
    }
}
From source file:boostingPL.MR.AdaBoostPLMapper.java
License:Open Source License
/** create instances header */
protected void setup(Context context) throws IOException, InterruptedException {
    String pathSrc = context.getConfiguration().get("BoostingPL.metadata");
    FileSystem hdfs = FileSystem.get(context.getConfiguration());
    FSDataInputStream dis = hdfs.open(new Path(pathSrc)); // open() already returns an FSDataInputStream
    LineReader in = new LineReader(dis);
    insts = InstancesHelper.createInstancesFromMetadata(in);
    in.close();
    dis.close();
}