Example usage for org.apache.hadoop.fs FileSystem open

List of usage examples for org.apache.hadoop.fs FileSystem open

Introduction

On this page you can find example usage of the org.apache.hadoop.fs FileSystem open method.

Prototype

public FSDataInputStream open(PathHandle fd) throws IOException 

Source Link

Document

Open an FSDataInputStream matching the PathHandle instance.
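Note that the prototype above is the PathHandle overload; every example below calls the more common open(Path) overload, which opens an FSDataInputStream at the indicated Path. A minimal, self-contained sketch of that overload (the HDFS path and class name here are hypothetical, not taken from the examples below):

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class OpenExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path("hdfs:///tmp/example.txt"); // hypothetical input path
        FileSystem fs = path.getFileSystem(conf);
        // open(Path) returns an FSDataInputStream: a normal InputStream with seek support
        try (FSDataInputStream in = fs.open(path);
                BufferedReader reader = new BufferedReader(new InputStreamReader(in))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}

The usages below all follow the same pattern: obtain a FileSystem for the target URI, open the Path, and wrap the returned stream in a reader, decompressor, or deserializer.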

Usage

From source file:be.ugent.intec.halvade.HalvadeOptions.java

License:Open Source License

protected void parseDictFile(Configuration conf) {
    be.ugent.intec.halvade.utils.Logger.DEBUG("parsing dictionary " + ref + DICT_SUFFIX);
    try {
        FileSystem fs = FileSystem.get(new URI(ref + DICT_SUFFIX), conf);
        FSDataInputStream stream = fs.open(new Path(ref + DICT_SUFFIX));
        String line = getLine(stream); // header
        dict = new SAMSequenceDictionary();
        line = getLine(stream);
        while (line != null) {
            String[] lineData = line.split("\\s+");
            String seqName = lineData[1].substring(lineData[1].indexOf(':') + 1);
            int seqLength = 0;
            try {
                seqLength = Integer.parseInt(lineData[2].substring(lineData[2].indexOf(':') + 1));
            } catch (NumberFormatException ex) {
                be.ugent.intec.halvade.utils.Logger.EXCEPTION(ex);
            }
            SAMSequenceRecord seq = new SAMSequenceRecord(seqName, seqLength);
            //                Logger.DEBUG("name: " + seq.getSequenceName() + " length: " + seq.getSequenceLength());
            dict.addSequence(seq);
            line = getLine(stream);
        }
        HalvadeConf.setSequenceDictionary(conf, dict);
    } catch (URISyntaxException | IOException ex) {
        be.ugent.intec.halvade.utils.Logger.EXCEPTION(ex);
    }

}

From source file:be.ugent.intec.halvade.uploader.input.BaseFileReader.java

protected static BufferedReader getReader(boolean readFromDistributedStorage, String file)
        throws FileNotFoundException, IOException {
    InputStream hdfsIn;
    if (readFromDistributedStorage) {
        Path pt = new Path(file);
        FileSystem fs = FileSystem.get(pt.toUri(), new Configuration());
        hdfsIn = fs.open(pt);
        // read the stream in the correct format!
        if (file.endsWith(".gz")) {
            GZIPInputStream gzip = new GZIPInputStream(hdfsIn, BUFFERSIZE);
            return new BufferedReader(new InputStreamReader(gzip));
        } else if (file.endsWith(".bz2")) {
            CBZip2InputStream bzip2 = new CBZip2InputStream(hdfsIn);
            return new BufferedReader(new InputStreamReader(bzip2));
        } else
            return new BufferedReader(new InputStreamReader(hdfsIn));

    } else {
        if (file.endsWith(".gz")) {
            GZIPInputStream gzip = new GZIPInputStream(new FileInputStream(file), BUFFERSIZE);
            return new BufferedReader(new InputStreamReader(gzip));
        } else if (file.endsWith(".bz2")) {
            CBZip2InputStream bzip2 = new CBZip2InputStream(new FileInputStream(file));
            return new BufferedReader(new InputStreamReader(bzip2));
        } else if (file.equals("-")) {
            return new BufferedReader(new InputStreamReader(System.in));
        } else
            return new BufferedReader(new FileReader(file));
    }
}
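The example above selects a decompressor by checking the file extension by hand. As an alternative (not part of the original source), here is a short sketch using Hadoop's CompressionCodecFactory, which resolves the codec from the path suffix for files read through fs.open:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CodecAwareReader {
    // Hypothetical helper: open a (possibly compressed) file from HDFS as a BufferedReader.
    public static BufferedReader open(String file, Configuration conf) throws IOException {
        Path path = new Path(file);
        FileSystem fs = path.getFileSystem(conf);
        InputStream in = fs.open(path);
        // getCodec() maps known suffixes (.gz, .bz2, ...) to a codec; null means read as plain text
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
        if (codec != null) {
            in = codec.createInputStream(in);
        }
        return new BufferedReader(new InputStreamReader(in));
    }
}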

From source file:be.ugent.intec.halvade.utils.ChromosomeSplitter.java

License:Open Source License

private void importSplitter(String filename, Configuration conf) throws URISyntaxException, IOException {
    regions = new ArrayList();
    DataInputStream dis = null;
    FileSystem hdfs = null;
    try {
        hdfs = FileSystem.get(new URI(filename), conf);
        Path file = new Path(filename);
        InputStream is = hdfs.open(file);
        dis = new DataInputStream(is);
        int len = dis.readInt();
        for (int i = 0; i < len; i++) {
            String contig = dis.readUTF();
            int start = dis.readInt();
            int end = dis.readInt();
            int key = dis.readInt();
            regions.add(new BedRegion(contig, start, end, key));
        }
    } finally {
        if (dis != null)
            dis.close();
    }
}

From source file:bigsatgps.BigDataHandler.java

License:Open Source License

/**
 *
 * @param infile
 * @return
 * @throws Exception
 */
public String ImageToSequence(String infile) throws Exception {
    String log4jConfPath = "lib/log4j.properties";
    PropertyConfigurator.configure(log4jConfPath);
    confHadoop = new Configuration();
    confHadoop.addResource(new Path("/hadoop/projects/hadoop-1.0.4/conf/core-site.xml"));
    confHadoop.addResource(new Path("/hadoop/projects/hadoop-1.0.4/conf/hdfs-site.xml"));
    FileSystem fs = FileSystem.get(confHadoop);
    Path inPath = new Path(infile);
    String outfile = infile.substring(0, infile.indexOf(".")) + ".seq";
    Path outPath = new Path(outfile);
    FSDataInputStream in = null;
    Text key = new Text();
    BytesWritable value = new BytesWritable();
    SequenceFile.Writer writer = null;
    try {
        in = fs.open(inPath);
        // size the buffer from the file length so the whole image is read into memory
        byte buffer[] = new byte[(int) fs.getFileStatus(inPath).getLen()];
        IOUtils.readFully(in, buffer, 0, buffer.length);
        writer = SequenceFile.createWriter(fs, confHadoop, outPath, key.getClass(), value.getClass());
        writer.append(new Text(inPath.getName()), new BytesWritable(buffer));
        System.out.println("Successfully created the sequencefile " + outfile);
        return outfile;
    } catch (IOException e) {
        System.err.println("Exception MESSAGES = " + e.getMessage());
        return null;
    } finally {
        IOUtils.closeStream(in);
        IOUtils.closeStream(writer);
    }
}

From source file:bixo.examples.crawl.MultiDomainUrlFilter.java

License:Apache License

public MultiDomainUrlFilter(Path filterFile) throws Exception {
    //we could require a filter file and put these in all urls or leave them here
    _suffixExclusionPattern = Pattern.compile("(?i)\\.(pdf|zip|gzip|gz|sit|bz|bz2|tar|tgz|exe)$");
    _protocolInclusionPattern = Pattern.compile("(?i)^(http|https)://");

    JobConf conf = HadoopUtils.getDefaultJobConf();
    try {//process the file passed in
        if (filterFile != null) {
            FileSystem fs = filterFile.getFileSystem(conf);
            if (fs.exists(filterFile)) {
                FSDataInputStream in = fs.open(filterFile);
                LineReader lr = new LineReader(in);
                Text tmpStr = new Text();
                while (lr.readLine(tmpStr) > 0 && !tmpStr.toString().equals("")) {//skip blank lines
                    String p = tmpStr.toString().trim();//remove whitespace
                    if (p.substring(0, 1).equals("+")) {// '+' means do-crawl
                        ArrayList filterPair = new ArrayList();
                        filterPair.add((Boolean) true);
                        filterPair.add(Pattern.compile(p.substring(1, p.length())));
                        _filters.add(filterPair);
                    } else if (p.substring(0, 1).equals("-")) {// '-' means filter out
                        ArrayList filterPair = new ArrayList();
                        filterPair.add((Boolean) false);
                        filterPair.add(Pattern.compile(p.substring(1, p.length())));
                        _filters.add(filterPair);
                    } // otherwise a comment or malformed filter pattern
                }
            }
        }

    } catch (Exception e) {
        //any cleanup here? This would indicate a file system error, most likely
        throw e;
    }
}

From source file:bixo.examples.crawl.RegexUrlFilter.java

License:Apache License

public static List<String> getUrlFilterPatterns(String urlFiltersFile)
        throws IOException, InterruptedException {
    //this reads regex filters from a file in HDFS or the native file system
    JobConf conf = HadoopUtils.getDefaultJobConf();
    Path filterFile = new Path(urlFiltersFile);
    FileSystem fs = filterFile.getFileSystem(conf);
    List<String> filterList = new ArrayList<String>();
    LOGGER.info("Looking for file: " + urlFiltersFile);
    if (fs.exists(filterFile)) {
        FSDataInputStream in = fs.open(filterFile);
        LineReader reader = new LineReader(in);
        Text tLine = new Text();
        while (reader.readLine(tLine) > 0) {
            String line = tLine.toString();
            if (StringUtils.isNotBlank(line)
                    && (line.startsWith(INCLUDE_CHAR) || line.startsWith(EXCLUDE_CHAR))) {
                filterList.add(line.trim());
            }
        }
        in.close();
    } else {
        LOGGER.info("Can't find file: " + urlFiltersFile);
    }
    return filterList;
}

From source file:biz.hangyang.knnspark.spark.KNNClassifySpark.java

public static JavaPairRDD<Entity, Object> calKDistance(final String trainingDataPath, String testingDataPath,
        final int k, final Map<Object, Double> weightMap, JavaSparkContext sc, int partition,
        final Accumulator<Integer> accum) {
    JavaRDD<String> testingDataRDD = sc.textFile(testingDataPath, partition);
    // map each input line to an Entity
    JavaRDD<Entity> testingEntityRDD = testingDataRDD.map(new Function<String, Entity>() {
        @Override
        public Entity call(String line) throws Exception {
            return new GeneEntity(line);
        }
    });
    // for each partition, compute the K nearest training distances per test entity and emit (Entity, KDistance) pairs
    JavaPairRDD<Entity, KDistance> ekRDD = testingEntityRDD
            .mapPartitionsToPair(new PairFlatMapFunction<Iterator<Entity>, Entity, KDistance>() {
                @Override
                public Iterable<Tuple2<Entity, KDistance>> call(Iterator<Entity> t) throws Exception {
                    // collect all test entities in this partition
                    List<Entity> entityList = new ArrayList<>();
                    while (t.hasNext()) {
                        entityList.add(t.next());
                    }
                    // one KDistance accumulator per entity
                    List<KDistance> kDistanceList = new ArrayList<>();
                    for (int i = 0; i < entityList.size(); i++) {
                        kDistanceList.add(new KDistance(k));
                    }

                    // stream the training data directly from HDFS
                    Configuration conf = new Configuration();
                    FileSystem fs = FileSystem.get(URI.create(trainingDataPath), conf);
                    FSDataInputStream in = fs.open(new Path(trainingDataPath));
                    BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"));
                    String line;
                    while ((line = br.readLine()) != null) {
                        Entity lineEntity = new GeneEntity(line);
                        for (int i = 0; i < entityList.size(); i++) {
                            kDistanceList.get(i).add(new DemoDistanceCatagory(
                                    lineEntity.distance(entityList.get(i)), lineEntity.category));
                        }
                    }

                    List<Tuple2<Entity, KDistance>> tList = new ArrayList<>();
                    for (int i = 0; i < entityList.size(); i++) {
                        tList.add(new Tuple2<>(entityList.get(i), kDistanceList.get(i)));
                    }
                    return tList;
                }
            });

    JavaPairRDD<Entity, Object> eoRDD = ekRDD
            .mapToPair(new PairFunction<Tuple2<Entity, KDistance>, Entity, Object>() {
                @Override
                public Tuple2<Entity, Object> call(Tuple2<Entity, KDistance> t) throws Exception {
                    KDistance kDistance = t._2();
                    // vote for the predicted category
                    Object catagory = KDistance.getCatagory(kDistance.get(), weightMap);
                    if (t._1().category.equals(catagory)) {
                        accum.add(1);
                    }
                    return new Tuple2<>(t._1(), catagory);
                }
            });

    return eoRDD;
}

From source file:boa.functions.BoaIntrinsics.java

License:Apache License

/**
 * Given the model URL, deserialize the model and return Model type
 *
 * @param URL the URL for the model
 * @return Model type after deserializing
 */
// TODO Take complete URL and then deserialize the model
// FIXME Returning Object as a type, this needs to be changed once we defined Model Type
@FunctionSpec(name = "load", returnType = "Model", formalParameters = { "string" })
public static Object load(final String URL) throws Exception {
    Object unserializedObject = null;
    FSDataInputStream in = null;
    try {
        final Configuration conf = new Configuration();
        final FileSystem fileSystem = FileSystem.get(conf);
        final Path path = new Path("hdfs://boa-njt" + URL);

        in = fileSystem.open(path);
        final byte[] b = new byte[(int) fileSystem.getFileStatus(path).getLen() + 1];
        in.read(b);

        ByteArrayInputStream bin = new ByteArrayInputStream(b);
        ObjectInputStream dataIn = new ObjectInputStream(bin);
        unserializedObject = dataIn.readObject();
        dataIn.close();
    } catch (final Exception ex) {
        ex.printStackTrace();
    } finally {
        if (in != null)
            try {
                in.close();
            } catch (final Exception e) {
                e.printStackTrace();
            }
    }
    return unserializedObject;
}

From source file:boa.io.BoaOutputCommitter.java

License:Apache License

private void storeOutput(final JobContext context, final int jobId) {
    if (jobId == 0)
        return;

    Connection con = null;
    FileSystem fileSystem = null;
    FSDataInputStream in = null;
    FSDataOutputStream out = null;

    try {
        fileSystem = outputPath.getFileSystem(context.getConfiguration());

        con = DriverManager.getConnection(url, user, password);

        PreparedStatement ps = null;
        try {
            ps = con.prepareStatement("INSERT INTO boa_output (id, length) VALUES (" + jobId + ", 0)");
            ps.executeUpdate();
        } catch (final Exception e) {
        } finally {
            try {
                if (ps != null)
                    ps.close();
            } catch (final Exception e) {
                e.printStackTrace();
            }
        }

        fileSystem.mkdirs(new Path("/boa", new Path("" + jobId)));
        out = fileSystem.create(new Path("/boa", new Path("" + jobId, new Path("output.txt"))));

        int partNum = 0;

        final byte[] b = new byte[64 * 1024 * 1024];
        long length = 0;
        boolean hasWebResult = false;

        while (true) {
            final Path path = new Path(outputPath, "part-r-" + String.format("%05d", partNum++));
            if (!fileSystem.exists(path))
                break;

            if (in != null)
                try {
                    in.close();
                } catch (final Exception e) {
                    e.printStackTrace();
                }
            in = fileSystem.open(path);

            int numBytes = 0;

            while ((numBytes = in.read(b)) > 0) {
                if (!hasWebResult) {
                    hasWebResult = true;

                    try {
                        ps = con.prepareStatement("UPDATE boa_output SET web_result=? WHERE id=" + jobId);
                        int webSize = 64 * 1024 - 1;
                        ps.setString(1, new String(b, 0, numBytes < webSize ? numBytes : webSize));
                        ps.executeUpdate();
                    } finally {
                        try {
                            if (ps != null)
                                ps.close();
                        } catch (final Exception e) {
                            e.printStackTrace();
                        }
                    }
                }
                out.write(b, 0, numBytes);
                length += numBytes;

                this.context.progress();
            }
        }

        try {
            ps = con.prepareStatement("UPDATE boa_output SET length=? WHERE id=" + jobId);
            ps.setLong(1, length);
            ps.executeUpdate();
        } finally {
            try {
                if (ps != null)
                    ps.close();
            } catch (final Exception e) {
                e.printStackTrace();
            }
        }
    } catch (final Exception e) {
        e.printStackTrace();
    } finally {
        try {
            if (con != null)
                con.close();
        } catch (final Exception e) {
            e.printStackTrace();
        }
        try {
            if (in != null)
                in.close();
        } catch (final Exception e) {
            e.printStackTrace();
        }
        try {
            if (out != null)
                out.close();
        } catch (final Exception e) {
            e.printStackTrace();
        }
        try {
            if (fileSystem != null)
                fileSystem.close();
        } catch (final Exception e) {
            e.printStackTrace();
        }
    }
}

From source file:boostingPL.MR.AdaBoostPLMapper.java

License:Open Source License

/** create instances header */
protected void setup(Context context) throws IOException, InterruptedException {
    String pathSrc = context.getConfiguration().get("BoostingPL.metadata");
    FileSystem hdfs = FileSystem.get(context.getConfiguration());
    FSDataInputStream dis = hdfs.open(new Path(pathSrc));
    LineReader in = new LineReader(dis);
    insts = InstancesHelper.createInstancesFromMetadata(in);
    in.close();
    dis.close();
}