Example usage for org.apache.commons.lang CharEncoding isSupported

List of usage examples for org.apache.commons.lang CharEncoding isSupported

Introduction

On this page you can find example usages of org.apache.commons.lang.CharEncoding.isSupported.

Prototype

public static boolean isSupported(String name) 

Source Link

Document

Returns whether the named charset is supported.

Usage

From source file:org.cubictest.persistence.TestPersistance.java

/**
 * Determines the character set to use for {@code file}.
 * <p>
 * The workspace resource located at the file's absolute path is asked for its charset first;
 * if that yields {@code null}, {@code ResourcesPlugin.getEncoding()} is used. If the workspace
 * lookup throws, the file is read instead and, when it starts with an XML declaration, the
 * value of its double-quoted {@code encoding="..."} pseudo-attribute is returned provided
 * {@code CharEncoding.isSupported} accepts it. When nothing else applies the result is
 * {@code "ISO-8859-1"}.
 *
 * @param file the file whose charset is wanted
 * @return a charset name, never {@code null}
 */
public static String getCharset(File file) {
    String charset = null;
    try {
        Path location = new Path(file.getAbsolutePath());

        try {
            charset = ResourcesPlugin.getWorkspace().getRoot().getFileForLocation(location).getCharset(true);
        } catch (CoreException e) {
            e.printStackTrace();
        }
        if (charset == null)
            charset = ResourcesPlugin.getEncoding();
    } catch (Exception e) {
        // The workspace lookup failed; fall back to sniffing the XML declaration.
        try {
            String test = FileUtils.readFileToString(file);
            if (test.startsWith("<?xml")) {
                final String marker = "encoding=\"";
                // Only look inside the declaration itself, so that an encoding="..." attribute
                // further down the document cannot be mistaken for the declared encoding.
                final int declarationEnd = test.indexOf("?>");
                final int markerPos = test.indexOf(marker);
                if (markerPos >= 0 && markerPos < declarationEnd) {
                    final int start = markerPos + marker.length();
                    // The value ends at its closing quote; other pseudo-attributes
                    // (e.g. standalone) may follow before "?>".
                    final int end = test.indexOf('"', start);
                    if (end > start && end < declarationEnd) {
                        String encoding = test.substring(start, end);
                        if (CharEncoding.isSupported(encoding))
                            return encoding;
                    }
                }
            }
        } catch (IOException ignored) {
            // Best effort only: an unreadable file simply gets the default charset below.
        }
    }
    if (charset == null)
        charset = "ISO-8859-1";
    return charset;
}

From source file:org.terrier.indexing.MultiDocumentFileCollection.java

/**
 * Sets the "charset" document property.
 * <p>
 * The property starts as {@code desiredEncoding}. If the "content-type" property mentions a
 * charset that {@code charsetMatchPattern} can extract and {@code CharEncoding.isSupported}
 * accepts (after {@code StringTools.normaliseEncoding}), that charset replaces it. Finally,
 * when {@code forceUTF8} is set, the property is overwritten with "utf-8".
 */
protected void extractCharset() {
    DocProperties.put("charset", desiredEncoding);
    //obtain the character set of the document and put in the charset property
    String cType = DocProperties.get("content-type");
    if (cType != null) {
        // Locale.ROOT keeps the lower-casing independent of the JVM default locale
        // (e.g. under a Turkish locale "ISO" would otherwise gain a dotless i).
        cType = cType.toLowerCase(java.util.Locale.ROOT);
        if (cType.contains("charset")) {
            final Matcher m = charsetMatchPattern.matcher(cType);
            if (m.find() && m.groupCount() > 0) {
                String charset = StringTools.normaliseEncoding(m.group(1));
                if (CharEncoding.isSupported(charset))
                    DocProperties.put("charset", charset);
            }
        }
    }
    //force UTF-8 for english documents - webpage isnt clear:
    //http://boston.lti.cs.cmu.edu/Data/clueweb09/dataset.html#encodings
    if (forceUTF8)
        DocProperties.put("charset", "utf-8");
}

From source file:org.terrier.indexing.TRECWebCollection.java

/**
 * Parses the document header block into {@code DocProperties}.
 * <p>
 * The first non-empty header line is split on whitespace and interpreted according to the
 * number of fields it has (1, 4 or 5); any other field count records nothing. Every later
 * line containing a colon after at least two characters is stored twice: under its
 * lower-cased name and under that name with hyphens removed. Finally, if the "contenttype"
 * property declares a supported charset, it is stored as the "charset" property.
 *
 * @throws IOException declared by the overridden method
 */
@Override
protected void afterPropertyTags() throws IOException {
    StringBuilder hdr = super.getTag(DOCHDR_START_LENGTH, DOCHDR_START, DOCHDR_END);
    if (hdr == null) {
        logger.info("No header found for document " + super.ThisDocID);
        return;
    }
    final String[] lines = hdr.toString().split("\\n+");
    boolean firstLineSeen = false;
    for (final String line : lines) {
        if (line.length() == 0)
            continue;

        if (!firstLineSeen) { //first line is a special case
            firstLineSeen = true;
            final String[] parts = line.split("\\s+");
            switch (parts.length) {
            //GOV,GOV2,W3C,CERC
            case 1:
                DocProperties.put("url", parts[0]);
                break;
            //Blog06, Blogs08
            case 4:
                DocProperties.put("url", parts[0]);
                DocProperties.put("docbytelength", parts[3]);
                break;
            //WT2G, WT10G
            case 5:
                DocProperties.put("url", parts[0]);
                DocProperties.put("ip", parts[1]);
                DocProperties.put("crawldate", parseDate(parts[2]));
                DocProperties.put("content-type", parts[3]);
                DocProperties.put("docbytelength", parts[4]);
                break;
            default:
                // unrecognised first-line layout: nothing is recorded
                break;
            }
        } else {
            final int colon = line.indexOf(':');
            if (colon > 1) {
                /*
                   Content-Type: text/html
                   becomes
                   content-type => text/html
                   contenttype => text/html
                */
                final String key = line.substring(0, colon).trim().toLowerCase(java.util.Locale.ROOT);
                // Take everything after the colon and trim it: this copes with "Key:value"
                // (no space) and with a line that ends at the colon, where skipping two
                // characters would drop a character or run past the end of the line.
                final String value = line.substring(colon + 1).trim();
                DocProperties.put(key, value);
                DocProperties.put(key.replace("-", ""), value);
            }
        }
    } //for
    String cType = DocProperties.get("contenttype");
    if (cType == null)
        return;
    // Locale.ROOT keeps charset names intact whatever the JVM default locale is.
    cType = cType.toLowerCase(java.util.Locale.ROOT);
    if (cType.contains("charset")) {
        final Matcher m = CHARSET_PATTERN.matcher(cType);
        if (m.find() && m.groupCount() > 0) {
            try {
                String charset = m.group(1);
                charset = StringTools.normaliseEncoding(charset);
                if (CharEncoding.isSupported(charset))
                    DocProperties.put("charset", charset);
            } catch (IllegalStateException ise) {
                /* this shouldnt happen if m.groupCount > 0, but it does */ }
        }
    }
}

From source file:org.terrier.indexing.WARC018Collection.java

/**
 * Move the collection to the start of the next document.
 * <p>
 * Lines are read until one starting with "WARC/0.18" is found. Records whose "warc-type"
 * header is not "response" are skipped. For a response record the HTTP headers are parsed,
 * the "docno", "url", "crawldate" and "charset" properties are filled in, and
 * {@code currentDocumentBlobLength} is set to the record length minus the HTTP header size.
 * When the end of the current file is reached the next file is opened.
 *
 * @return {@code true} when positioned on a response record; {@code false} when no further
 *         file can be opened or an {@code IOException} occurred (it is logged)
 */
public boolean nextDocument() {
    DocProperties = new HashMap<String, String>(15);
    try {
        warcrecord: while (true) {
            String line = readLine();
            //look for a warc line
            if (line.startsWith("WARC/0.18")) {
                int headerSize = parseHeaders(true);
                final long warc_response_length = Long.parseLong(DocProperties.get("content-length"));
                // Constant-first comparison: a record without a warc-type header is skipped
                // like any other non-response record instead of raising a NullPointerException.
                if (!"response".equals(DocProperties.get("warc-type"))) {
                    // NOTE(review): the number of bytes actually skipped is not checked - confirm
                    // that this stream always skips the full amount requested.
                    is.skip(warc_response_length);
                    //-49
                    continue warcrecord;
                }
                headerSize = parseHeaders(false);
                DocProperties.put("docno", DocProperties.get(warc_docno_header));
                DocProperties.put("url", DocProperties.get(warc_url_header));
                DocProperties.put("crawldate", parseDate(DocProperties.get(warc_crawldate_header)));
                if (logger.isDebugEnabled())
                    logger.debug("Now working on document " + DocProperties.get("docno"));

                DocProperties.put("charset", desiredEncoding);
                //obtain the character set of the document and put in the charset property
                String cType = DocProperties.get("content-type");
                if (cType != null) {
                    // Locale.ROOT keeps charset names intact whatever the JVM default locale is.
                    cType = cType.toLowerCase(java.util.Locale.ROOT);
                    if (cType.contains("charset")) {
                        final Matcher m = charsetMatchPattern.matcher(cType);
                        if (m.find() && m.groupCount() > 0) {
                            String charset = StringTools.normaliseEncoding(m.group(1));
                            if (CharEncoding.isSupported(charset))
                                DocProperties.put("charset", charset);
                        }
                    }
                }
                //force UTF-8 for english documents - webpage isnt clear:
                //http://boston.lti.cs.cmu.edu/Data/clueweb09/dataset.html#encodings
                if (forceUTF8)
                    DocProperties.put("charset", "utf-8");
                documentsInThisFile++;
                currentDocumentBlobLength = warc_response_length - headerSize; //-16
                return true;
            }
            if (eof) {
                if (documentsInThisFile == 0) {
                    // Guard the lookup so that an empty file list cannot turn this warning
                    // into an index-out-of-bounds failure.
                    logger.warn(this.getClass().getSimpleName() + " found no documents in "
                            + (FilesToProcess.size() > 0 ? FilesToProcess.get(FileNumber - 1) : "input stream")
                            + ". " + "Perhaps trec.collection.class is wrongly set or decompression failed.");
                }
                if (!openNextFile())
                    return false;
            }
        }
    } catch (IOException ioe) {
        // Pass the exception as the throwable so the stack trace is logged too.
        logger.error("IOException while reading WARC format collection file", ioe);
    }
    return false;
}

From source file:org.terrier.indexing.WARC10Collection.java

/**
 * Move the collection to the start of the next document.
 * <p>
 * Lines are read until one starting with "WARC/1.0" is found. Records whose "warc-type"
 * header is not "response" are skipped. For a response record, everything before the HTTP
 * status line is ignored, the status code is parsed and the HTTP headers are read. Records
 * with status 301 or 302 are passed to {@code processRedirect} and skipped. Otherwise the
 * "docno", "url", "crawldate" and "charset" properties are filled in and
 * {@code currentDocumentBlobLength} is set to the record length minus the bytes consumed
 * so far (never negative). When the end of the current file is reached the next file is
 * opened.
 *
 * @return {@code true} when positioned on a document; {@code false} when no further file
 *         can be opened or an {@code IOException} occurred (it is logged)
 */
public boolean nextDocument() {
    DocProperties = new HashMap<String, String>(15);
    try {
        warcrecord: while (true) {
            String line = readLine();
            //look for a warc line
            if (line.startsWith("WARC/1.0")) {
                parseHeaders(true);
                final long warc_response_length = Long.parseLong(DocProperties.get("content-length"));
                // Constant-first comparison: a record without a warc-type header is skipped
                // like any other non-response record instead of raising a NullPointerException.
                if (!"response".equals(DocProperties.get("warc-type"))) {
                    // NOTE(review): the number of bytes actually skipped is not checked - confirm
                    // that this stream always skips the full amount requested.
                    is.skip(warc_response_length);
                    continue warcrecord;
                }

                int headerSize = 0;
                //ignore all content before the HTTP line, e.g. clueweb12-1216wb-96-28178
                String statusLine = readLine();
                headerSize += readLineByteCount;
                while (!statusLine.startsWith("HTTP/")) {
                    // Without this check a truncated record would keep this loop
                    // spinning once the end of the file has been reached.
                    if (eof) {
                        throw new IOException("No HTTP status line found before end of file for "
                                + DocProperties.get(warc_docno_header));
                    }
                    statusLine = readLine();
                    headerSize += readLineByteCount;
                }
                final String[] statusParts = statusLine.split(" ", 3);
                // A status line with no space has no code field to parse.
                if (statusParts.length < 2) {
                    throw new IOException("Invalid status line '" + statusLine + "' for "
                            + DocProperties.get(warc_docno_header));
                }
                int code;
                try {
                    code = Integer.parseInt(statusParts[1]);
                } catch (NumberFormatException nfe) {
                    // keep the parse failure as the cause
                    throw new IOException("Invalid status line '" + statusLine + "' for "
                            + DocProperties.get(warc_docno_header), nfe);
                }
                headerSize += parseHeaders(false);
                if (code == 301 || code == 302) {
                    processRedirect(DocProperties.get(warc_url_header), DocProperties.get("location"));
                    //there is no need to skip, as there should only be headers
                    continue warcrecord;
                } else if (code != 200) {
                    assert false : "Unsupported status code: " + code;
                }

                DocProperties.put("docno", DocProperties.get(warc_docno_header));
                DocProperties.put("url", DocProperties.get(warc_url_header));
                DocProperties.put("crawldate", parseDate(DocProperties.get(warc_crawldate_header)));
                if (logger.isDebugEnabled())
                    logger.debug("Now working on document " + DocProperties.get("docno"));

                DocProperties.put("charset", desiredEncoding);
                //obtain the character set of the document and put in the charset property
                String cType = DocProperties.get("content-type");
                if (cType != null) {
                    // Locale.ROOT keeps charset names intact whatever the JVM default locale is.
                    cType = cType.toLowerCase(java.util.Locale.ROOT);
                    if (cType.contains("charset")) {
                        final Matcher m = charsetMatchPattern.matcher(cType);
                        if (m.find() && m.groupCount() > 0) {
                            String charset = StringTools.normaliseEncoding(m.group(1));
                            if (CharEncoding.isSupported(charset))
                                DocProperties.put("charset", charset);
                        }
                    }
                }
                //force UTF-8 for english documents - webpage isnt clear:
                //http://boston.lti.cs.cmu.edu/Data/clueweb09/dataset.html#encodings
                if (forceUTF8)
                    DocProperties.put("charset", "utf-8");
                documentsInThisFile++;
                //document clueweb12-0000tw-00-00010 has no content, causes a negative currentDocumentBlobLength
                currentDocumentBlobLength = Math.max(0, warc_response_length - headerSize);

                return true;
            }
            if (eof) {
                if (documentsInThisFile == 0) {
                    String sourcemsg = FilesToProcess.size() > 0 ? FilesToProcess.get(FileNumber - 1)
                            : "input stream";
                    logger.warn(this.getClass().getSimpleName() + " found no documents in " + sourcemsg + ". "
                            + "Perhaps trec.collection.class is wrongly set or decompression failed.");
                }
                if (!openNextFile())
                    return false;
            }
        }
    } catch (IOException ioe) {
        // Pass the exception as the throwable so the stack trace is logged too.
        logger.error("IOException while reading WARC format collection file", ioe);
    }
    return false;
}