Example usage for org.apache.hadoop.conf Configuration getConfResourceAsInputStream

List of usage examples for org.apache.hadoop.conf Configuration getConfResourceAsInputStream

Introduction

On this page you can find example usages of org.apache.hadoop.conf.Configuration.getConfResourceAsInputStream.

Prototype

public InputStream getConfResourceAsInputStream(String name) 

Source Link

Document

Get an input stream attached to the configuration resource with the given name.

Usage

From source file:cn.edu.hfut.dmic.webcollectorcluster.crawler.Crawler.java

/**
 * Decorates the given generator with an interval filter and a URL regex filter.
 *
 * <p>The regex rules are read, one rule per line, from the configuration resource named
 * {@code "regex"}. If {@code interval} has not been set yet it is initialised from the
 * {@code generator.interval} property (default {@code -1}).
 *
 * @param generator the generator to decorate
 * @return the decorated generator, or {@code null} if the rules could not be loaded
 *         (the failure is logged)
 */
@Override
public Generator createGenerator(Generator generator) {

    if (interval == null) {
        interval = CrawlerConfiguration.create().getLong("generator.interval", -1);
    }
    try {
        Configuration conf = CrawlerConfiguration.create();
        ArrayList<String> regexRules = new ArrayList<String>();

        // try-with-resources guarantees the stream is released even when reading fails;
        // a null resource is tolerated by the construct and reported explicitly below.
        try (InputStream regexIs = conf.getConfResourceAsInputStream("regex")) {
            if (regexIs == null) {
                // Surface a meaningful message instead of an anonymous NullPointerException.
                throw new IllegalStateException("Configuration resource \"regex\" not found");
            }
            BufferedReader br = new BufferedReader(new InputStreamReader(regexIs));
            String line;
            while ((line = br.readLine()) != null) {
                regexRules.add(line);
            }
        }

        return new URLRegexFilter(new IntervalFilter(generator, interval), regexRules);
    } catch (Exception ex) {
        LogUtils.getLogger().info("Exception", ex);
        return null;
    }

}

From source file:com.reidin.ppd.listings.date.DateListingsFilter.java

License:Apache License

/**
 * Reads the date rules from the configuration resource named by the
 * {@code DATE_CONFIG_FILE} property.
 *
 * <p>Each rule line has the form {@code group|name=value}. Lines starting with
 * {@code #}, lines without {@code =}, and lines whose key part (the text before the first
 * {@code =}) does not contain both a group and a name are skipped. The value is everything
 * after the first {@code =}, so values may themselves contain {@code =}.
 *
 * @param conf configuration used to locate and open the rules resource
 * @return a map from group to a map of name to value
 * @throws IOException if the resource cannot be found or read
 */
private HashMap<String, HashMap<String, String>> readRules(Configuration conf) throws IOException {
    String resource = conf.get(DATE_CONFIG_FILE);
    HashMap<String, HashMap<String, String>> map = new HashMap<String, HashMap<String, String>>();

    // try-with-resources closes the stream on every exit path.
    try (java.io.InputStream in = conf.getConfResourceAsInputStream(resource)) {
        if (in == null) {
            throw new IOException("Date configuration resource not found: " + resource);
        }
        BufferedReader reader = new BufferedReader(new InputStreamReader(in));
        String line;
        while ((line = reader.readLine()) != null) {
            if (line.startsWith("#") || !line.contains("=")) {
                continue;
            }
            // Limit of 2 keeps any '=' characters that belong to the value, and always
            // yields a (possibly empty) value element.
            String[] strings = line.split("=", 2);
            String[] keys = strings[0].split("\\|");
            if (keys.length < 2) {
                // Malformed key such as "a=b|c" or "a|=v": no group/name pair to store.
                continue;
            }
            HashMap<String, String> values = map.get(keys[0]);
            if (values == null) {
                values = new HashMap<String, String>();
                map.put(keys[0], values);
            }
            values.put(keys[1], strings[1]);
        }
    }
    return map;

}

From source file:org.apache.nutch.exchange.Exchanges.java

License:Apache License

/**
 * Loads the configuration of each exchange.
 *
 * @param conf Nutch's configuration./*from  www  .j a va 2 s .  co m*/
 * @return An array with each exchange's configuration.
 */
private ExchangeConfig[] loadConfigurations(Configuration conf) {
    String filename = conf.get("exchanges.exchanges.file", "exchanges.xml");
    InputSource inputSource = new InputSource(conf.getConfResourceAsInputStream(filename));

    final List<ExchangeConfig> configList = new LinkedList<>();

    try {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        DocumentBuilder builder = factory.newDocumentBuilder();
        Element rootElement = builder.parse(inputSource).getDocumentElement();
        NodeList exchangeList = rootElement.getElementsByTagName("exchange");

        for (int i = 0; i < exchangeList.getLength(); i++) {
            Element element = (Element) exchangeList.item(i);
            ExchangeConfig exchangeConfig = ExchangeConfig.getInstance(element);

            if ("default".equals(exchangeConfig.getClazz())) {
                this.defaultExchangeConfig = exchangeConfig;
                continue;
            }

            configList.add(exchangeConfig);
        }

    } catch (SAXException | IOException | ParserConfigurationException e) {
        LOG.error(e.toString());
    }

    return configList.toArray(new ExchangeConfig[0]);
}

From source file:org.apache.nutch.parse.ParsePluginsReader.java

License:Apache License

/**
 * Reads the <code>parse-plugins.xml</code> file and returns the
 * {@link #ParsePluginList} defined by it.
 * /*ww  w  .j a va  2s.com*/
 * @return A {@link #ParsePluginList} specified by the
 *         <code>parse-plugins.xml</code> file.
 * @throws Exception
 *           If any parsing error occurs.
 */
public ParsePluginList parse(Configuration conf) {

    ParsePluginList pList = new ParsePluginList();

    // open up the XML file
    DocumentBuilderFactory factory = null;
    DocumentBuilder parser = null;
    Document document = null;
    InputSource inputSource = null;

    InputStream ppInputStream = null;
    if (fParsePluginsFile != null) {
        URL parsePluginUrl = null;
        try {
            parsePluginUrl = new URL(fParsePluginsFile);
            ppInputStream = parsePluginUrl.openStream();
        } catch (Exception e) {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Unable to load parse plugins file from URL " + "[" + fParsePluginsFile
                        + "]. Reason is [" + e + "]");
            }
            return pList;
        }
    } else {
        ppInputStream = conf.getConfResourceAsInputStream(conf.get(PP_FILE_PROP));
    }

    inputSource = new InputSource(ppInputStream);

    try {
        factory = DocumentBuilderFactory.newInstance();
        parser = factory.newDocumentBuilder();
        document = parser.parse(inputSource);
    } catch (Exception e) {
        if (LOG.isWarnEnabled()) {
            LOG.warn("Unable to parse [" + fParsePluginsFile + "]." + "Reason is [" + e + "]");
        }
        return null;
    }

    Element parsePlugins = document.getDocumentElement();

    // build up the alias hash map
    Map<String, String> aliases = getAliases(parsePlugins);
    // And store it on the parse plugin list
    pList.setAliases(aliases);

    // get all the mime type nodes
    NodeList mimeTypes = parsePlugins.getElementsByTagName("mimeType");

    // iterate through the mime types
    for (int i = 0; i < mimeTypes.getLength(); i++) {
        Element mimeType = (Element) mimeTypes.item(i);
        String mimeTypeStr = mimeType.getAttribute("name");

        // for each mimeType, get the plugin list
        NodeList pluginList = mimeType.getElementsByTagName("plugin");

        // iterate through the plugins, add them in order read
        // OR if they have a special order="" attribute, then hold those in
        // a separate list, and then insert them into the final list at the
        // order specified
        if (pluginList != null && pluginList.getLength() > 0) {
            List<String> plugList = new ArrayList<String>(pluginList.getLength());

            for (int j = 0; j < pluginList.getLength(); j++) {
                Element plugin = (Element) pluginList.item(j);
                String pluginId = plugin.getAttribute("id");
                String extId = aliases.get(pluginId);
                if (extId == null) {
                    // Assume an extension id is directly specified
                    extId = pluginId;
                }
                String orderStr = plugin.getAttribute("order");
                int order = -1;
                try {
                    order = Integer.parseInt(orderStr);
                } catch (NumberFormatException ignore) {
                }
                if (order != -1) {
                    plugList.add(order - 1, extId);
                } else {
                    plugList.add(extId);
                }
            }

            // now add the plugin list and map it to this mimeType
            pList.setPluginList(mimeTypeStr, plugList);

        } else if (LOG.isWarnEnabled()) {
            LOG.warn("ParsePluginsReader:ERROR:no plugins defined for mime type: " + mimeTypeStr
                    + ", continuing parse");
        }
    }
    return pList;
}

From source file:org.apache.nutch.protocol.http.proxy.api.HttpBase.java

License:Apache License

/**
 * Applies the given configuration to this protocol instance.
 *
 * <p>Reads proxy, timeout, content-limit, user-agent, header and TLS settings from
 * {@code conf}; optionally loads a list of rotating user-agent names
 * ({@code http.agent.rotate}); and finally loads proxy entries from the configuration
 * resource {@code proxylist.conf}, enabling proxy use when at least one entry is found.
 *
 * @param conf the configuration to read settings from
 */
public void setConf(Configuration conf) {
    this.conf = conf;
    // NOTE(review): the original trailing comment on this line was lost to a
    // character-encoding error.
    this.webProtectSkip = conf.getBoolean(MyConstant.SKIP_WEB_CRAWL_PROTECT, false);
    this.proxyHost = conf.get("http.proxy.host");
    this.proxyPort = conf.getInt("http.proxy.port", 8080);
    // NOTE(review): original comment was mis-encoded ("?IP"); presumably the maximum
    // number of requests per proxy IP - confirm.
    this.proxyReqMax = conf.getInt("http.proxy.reqmax", 500);

    this.useProxy = (proxyHost != null && proxyHost.length() > 0);
    this.timeout = conf.getInt("http.timeout", 10000);
    this.maxContent = conf.getInt("http.content.limit", 64 * 1024);
    this.userAgent = getAgentString(conf.get("http.agent.name"), conf.get("http.agent.version"),
            conf.get("http.agent.description"), conf.get("http.agent.url"), conf.get("http.agent.email"));
    this.acceptLanguage = conf.get("http.accept.language", acceptLanguage);
    this.accept = conf.get("http.accept", accept);
    // backward-compatible default setting
    this.useHttp11 = conf.getBoolean("http.useHttp11", false);
    this.responseTime = conf.getBoolean("http.store.responsetime", true);
    this.enableIfModifiedsinceHeader = conf.getBoolean("http.enable.if.modified.since.header", true);
    this.robots.setConf(conf);

    // NUTCH-1941: read list of alternating agent names
    if (conf.getBoolean("http.agent.rotate", false)) {
        String agentsFile = conf.get("http.agent.rotate.file", "agents.txt");
        BufferedReader br = null;
        try {
            Reader reader = conf.getConfResourceAsReader(agentsFile);
            br = new BufferedReader(reader);
            userAgentNames = new ArrayList<String>();
            String word = "";
            while ((word = br.readLine()) != null) {
                if (!word.trim().isEmpty())
                    userAgentNames.add(word.trim());
            }

            if (userAgentNames.size() == 0) {
                logger.warn("Empty list of user agents in http.agent.rotate.file {}", agentsFile);
                userAgentNames = null;
            }

        } catch (Exception e) {
            logger.warn("Failed to read http.agent.rotate.file {}: {}", agentsFile,
                    StringUtils.stringifyException(e));
            userAgentNames = null;
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    // ignore
                }
            }
        }
        if (userAgentNames == null) {
            logger.warn("Falling back to fixed user agent set via property http.agent.name");
        }
    }

    String[] protocols = conf.getStrings("http.tls.supported.protocols", "TLSv1.2", "TLSv1.1", "TLSv1",
            "SSLv3");
    String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites",
            "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384", "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384",
            "TLS_RSA_WITH_AES_256_CBC_SHA256", "TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384",
            "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384", "TLS_DHE_RSA_WITH_AES_256_CBC_SHA256",
            "TLS_DHE_DSS_WITH_AES_256_CBC_SHA256", "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA",
            "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA", "TLS_RSA_WITH_AES_256_CBC_SHA",
            "TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA", "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA",
            "TLS_DHE_RSA_WITH_AES_256_CBC_SHA", "TLS_DHE_DSS_WITH_AES_256_CBC_SHA",
            "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256", "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256",
            "TLS_RSA_WITH_AES_128_CBC_SHA256", "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256",
            "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256", "TLS_DHE_RSA_WITH_AES_128_CBC_SHA256",
            "TLS_DHE_DSS_WITH_AES_128_CBC_SHA256", "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA",
            "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA", "TLS_RSA_WITH_AES_128_CBC_SHA",
            "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA", "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA",
            "TLS_DHE_RSA_WITH_AES_128_CBC_SHA", "TLS_DHE_DSS_WITH_AES_128_CBC_SHA",
            "TLS_ECDHE_ECDSA_WITH_RC4_128_SHA", "TLS_ECDHE_RSA_WITH_RC4_128_SHA", "SSL_RSA_WITH_RC4_128_SHA",
            "TLS_ECDH_ECDSA_WITH_RC4_128_SHA", "TLS_ECDH_RSA_WITH_RC4_128_SHA",
            "TLS_ECDHE_ECDSA_WITH_3DES_EDE_CBC_SHA", "TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA",
            "SSL_RSA_WITH_3DES_EDE_CBC_SHA", "TLS_ECDH_ECDSA_WITH_3DES_EDE_CBC_SHA",
            "TLS_ECDH_RSA_WITH_3DES_EDE_CBC_SHA", "SSL_DHE_RSA_WITH_3DES_EDE_CBC_SHA",
            "SSL_DHE_DSS_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_RC4_128_MD5",
            "TLS_EMPTY_RENEGOTIATION_INFO_SCSV", "TLS_RSA_WITH_NULL_SHA256", "TLS_ECDHE_ECDSA_WITH_NULL_SHA",
            "TLS_ECDHE_RSA_WITH_NULL_SHA", "SSL_RSA_WITH_NULL_SHA", "TLS_ECDH_ECDSA_WITH_NULL_SHA",
            "TLS_ECDH_RSA_WITH_NULL_SHA", "SSL_RSA_WITH_NULL_MD5", "SSL_RSA_WITH_DES_CBC_SHA",
            "SSL_DHE_RSA_WITH_DES_CBC_SHA", "SSL_DHE_DSS_WITH_DES_CBC_SHA", "TLS_KRB5_WITH_RC4_128_SHA",
            "TLS_KRB5_WITH_RC4_128_MD5", "TLS_KRB5_WITH_3DES_EDE_CBC_SHA", "TLS_KRB5_WITH_3DES_EDE_CBC_MD5",
            "TLS_KRB5_WITH_DES_CBC_SHA", "TLS_KRB5_WITH_DES_CBC_MD5");

    tlsPreferredProtocols = new HashSet<String>(Arrays.asList(protocols));
    tlsPreferredCipherSuites = new HashSet<String>(Arrays.asList(ciphers));

    logConf();

    // Load the custom proxy list. The two log messages below are mis-encoded in the
    // source this was taken from; they are kept unchanged because they are runtime output.
    LOGGER.info("??.........");
    // try-with-resources closes the stream on every path (and tolerates a null resource);
    // a failure while closing is reported through the logger by the catch below.
    try (InputStream is = conf.getConfResourceAsInputStream("proxylist.conf")) {
        if (is == null) {
            // No proxy list configured: nothing more to do.
            return;
        }
        BufferedReader dr = new BufferedReader(new InputStreamReader(is));
        String tmp = null;
        while ((tmp = dr.readLine()) != null) {
            // Skip blank lines and '#' comment lines.
            if (!"".equals(tmp) && !tmp.startsWith("#")) {
                proxyList.add(tmp);
                LOGGER.info(tmp);
            }
        }
        LOGGER.info("????");
        if (!proxyList.isEmpty()) {
            this.useProxy = true;
        }
    } catch (Exception e) {
        logger.error("custom proxylist read error :", e);
    }
}

From source file:org.apache.nutch.tools.PruneIndexTool.java

License:Apache License

/**
 * Command-line entry point for the index pruning tool.
 *
 * <p>{@code args[0]} is an index directory, or a directory whose sub-directories contain
 * an {@code index} directory. Recognised options: {@code -force}, {@code -dryrun},
 * {@code -queries <file>}, {@code -output <file>} and {@code -showfields <f1,f2,...>}.
 * When {@code -queries} is not given, the queries are loaded from the configuration
 * resource named by the {@code prune.index.tool.queries} property.
 *
 * <p>Every failure is logged at fatal level and the method returns without pruning.
 *
 * @param args command-line arguments as described above
 * @throws Exception if an error occurs outside the individually guarded steps
 */
public static void main(String[] args) throws Exception {
    if (args.length == 0) {
        usage();
        if (LOG.isFatalEnabled()) {
            LOG.fatal("Missing arguments");
        }
        return;
    }
    File idx = new File(args[0]);
    if (!idx.isDirectory()) {
        usage();
        if (LOG.isFatalEnabled()) {
            LOG.fatal("Not a directory: " + idx);
        }
        return;
    }
    Vector<File> paths = new Vector<File>();
    if (IndexReader.indexExists(idx)) {
        paths.add(idx);
    } else {
        // try and see if there are segments inside, with index dirs
        File[] dirs = idx.listFiles(new FileFilter() {
            public boolean accept(File f) {
                return f.isDirectory();
            }
        });
        if (dirs == null || dirs.length == 0) {
            usage();
            if (LOG.isFatalEnabled()) {
                LOG.fatal("No indexes in " + idx);
            }
            return;
        }
        for (int i = 0; i < dirs.length; i++) {
            File sidx = new File(dirs[i], "index");
            if (sidx.exists() && sidx.isDirectory() && IndexReader.indexExists(sidx)) {
                paths.add(sidx);
            }
        }
        if (paths.size() == 0) {
            usage();
            if (LOG.isFatalEnabled()) {
                LOG.fatal("No indexes in " + idx + " or its subdirs.");
            }
            return;
        }
    }
    File[] indexes = paths.toArray(new File[0]);
    boolean force = false;
    boolean dryrun = false;
    String qPath = null;
    String outPath = null;
    String fList = null;
    for (int i = 1; i < args.length; i++) {
        String option = args[i];
        if (option.equals("-force")) {
            force = true;
        } else if (option.equals("-dryrun")) {
            dryrun = true;
        } else if (option.equals("-queries") || option.equals("-output") || option.equals("-showfields")) {
            // These options consume the next argument; make sure it is actually there
            // instead of running off the end of the array.
            if (i + 1 >= args.length) {
                usage();
                if (LOG.isFatalEnabled()) {
                    LOG.fatal("Missing value for option: " + option);
                }
                return;
            }
            String value = args[++i];
            if (option.equals("-queries")) {
                qPath = value;
            } else if (option.equals("-output")) {
                outPath = value;
            } else {
                fList = value;
            }
        } else {
            usage();
            if (LOG.isFatalEnabled()) {
                LOG.fatal("Unrecognized option: " + option);
            }
            return;
        }
    }
    Vector<PruneChecker> cv = new Vector<PruneChecker>();
    if (fList != null) {
        StringTokenizer st = new StringTokenizer(fList, ",");
        Vector<String> tokens = new Vector<String>();
        while (st.hasMoreTokens()) {
            tokens.add(st.nextToken());
        }
        String[] fields = tokens.toArray(new String[0]);
        PruneChecker pc = new PrintFieldsChecker(System.out, fields);
        cv.add(pc);
    }

    if (outPath != null) {
        StoreUrlsChecker luc = new StoreUrlsChecker(new File(outPath), false);
        cv.add(luc);
    }

    PruneChecker[] checkers = null;
    if (cv.size() > 0) {
        checkers = cv.toArray(new PruneChecker[0]);
    }
    Query[] queries = null;
    InputStream is = null;
    if (qPath != null) {
        is = new FileInputStream(qPath);
    } else {
        Configuration conf = NutchConfiguration.create();
        qPath = conf.get("prune.index.tool.queries");
        is = conf.getConfResourceAsInputStream(qPath);
    }
    if (is == null) {
        if (LOG.isFatalEnabled()) {
            LOG.fatal("Can't load queries from " + qPath);
        }
        return;
    }
    try {
        queries = parseQueries(is);
    } catch (Exception e) {
        if (LOG.isFatalEnabled()) {
            LOG.fatal("Error parsing queries: " + e.getMessage(), e);
        }
        return;
    } finally {
        try {
            is.close();
        } catch (Exception ignored) {
            // Best effort: the queries have already been read (or parsing already failed).
        }
    }
    try {
        PruneIndexTool pit = new PruneIndexTool(indexes, queries, checkers, force, dryrun);
        pit.run();
    } catch (Exception e) {
        if (LOG.isFatalEnabled()) {
            LOG.fatal("Error running PruneIndexTool: " + e.getMessage(), e);
        }
        return;
    }
}

From source file:org.apache.nutch.util.MimeUtil.java

License:Apache License

/**
 * Creates a MIME utility backed by the {@code MimeTypes} registry cached for the given
 * configuration.
 *
 * <p>If no registry is cached yet, one is built from the resource named by the
 * {@code mime.types.file} property when that property is set and loadable, and from
 * Tika's defaults otherwise; the result is then stored in the object cache. The
 * {@code mime.type.magic} property (default {@code true}) is read into {@code mimeMagic}.
 *
 * @param conf configuration supplying the object cache and the MIME settings
 * @throws RuntimeException if obtaining the registry fails unexpectedly
 */
public MimeUtil(Configuration conf) {
    tika = new Tika();
    ObjectCache objectCache = ObjectCache.get(conf);
    MimeTypes mimeTypez = (MimeTypes) objectCache.getObject(MimeTypes.class.getName());
    if (mimeTypez == null) {
        try {
            String customMimeTypeFile = conf.get("mime.types.file");
            if (customMimeTypeFile != null && !customMimeTypeFile.isEmpty()) {
                // try-with-resources releases the stream once the registry is built.
                try (java.io.InputStream in = conf.getConfResourceAsInputStream(customMimeTypeFile)) {
                    if (in == null) {
                        LOG.error("Can't load mime.types.file : " + customMimeTypeFile + " using Tika's default");
                    } else {
                        mimeTypez = MimeTypesFactory.create(in);
                    }
                } catch (Exception e) {
                    // Keep the cause so a broken custom file can actually be diagnosed.
                    LOG.error("Can't load mime.types.file : " + customMimeTypeFile + " using Tika's default", e);
                }
            }
            if (mimeTypez == null) {
                mimeTypez = MimeTypes.getDefaultMimeTypes();
            }
        } catch (Exception e) {
            LOG.error("Exception in MimeUtil " + e.getMessage());
            throw new RuntimeException(e);
        }
        objectCache.setObject(MimeTypes.class.getName(), mimeTypez);
    }

    this.mimeTypes = mimeTypez;
    this.mimeMagic = conf.getBoolean("mime.type.magic", true);
}