List of usage examples for org.apache.commons.lang CharEncoding isSupported
public static boolean isSupported(String name)
Returns whether the named charset is supported.
From source file:org.cubictest.persistence.TestPersistance.java
public static String getCharset(File file) { String charset = null;//w ww . j a v a 2 s . c om try { Path location = new Path(file.getAbsolutePath()); try { charset = ResourcesPlugin.getWorkspace().getRoot().getFileForLocation(location).getCharset(true); } catch (CoreException e) { e.printStackTrace(); } if (charset == null) charset = ResourcesPlugin.getEncoding(); } catch (Exception e) { try { String test = FileUtils.readFileToString(file); if (test.startsWith("<?xml")) { int start = test.indexOf("encoding=\"") + 10; int end = test.indexOf("\"?>", start); String encoding = test.substring(start, end); if (CharEncoding.isSupported(encoding)) return encoding; } } catch (IOException e2) { } } if (charset == null) charset = "ISO-8859-1"; return charset; }
From source file:org.terrier.indexing.MultiDocumentFileCollection.java
protected void extractCharset() { DocProperties.put("charset", desiredEncoding); //obtain the character set of the document and put in the charset property String cType = DocProperties.get("content-type"); //force UTF-8 for english documents - webpage isnt clear: //http://boston.lti.cs.cmu.edu/Data/clueweb09/dataset.html#encodings if (cType != null) { cType = cType.toLowerCase();/*from ww w . jav a2 s . co m*/ if (cType.contains("charset")) { final Matcher m = charsetMatchPattern.matcher(cType); if (m.find() && m.groupCount() > 0) { String charset = StringTools.normaliseEncoding(m.group(1)); if (CharEncoding.isSupported(charset)) DocProperties.put("charset", charset); } } } if (forceUTF8) DocProperties.put("charset", "utf-8"); }
From source file:org.terrier.indexing.TRECWebCollection.java
@Override protected void afterPropertyTags() throws IOException { StringBuilder hdr = super.getTag(DOCHDR_START_LENGTH, DOCHDR_START, DOCHDR_END); if (hdr == null) { logger.info("No header found for document " + super.ThisDocID); return;/* w w w .j av a2s .c om*/ } final String[] lines = hdr.toString().split("\\n+"); boolean first = false; for (int i = 0; i < lines.length; i++) { if (lines[i].length() == 0) continue; if (!first) { //first line is a special case first = true; final String[] parts = lines[i].split("\\s+"); switch (parts.length) { //GOV,GOV2,W3C,CERC case 1: DocProperties.put("url", parts[0]); break; //Blog06, Blogs08 case 4: DocProperties.put("url", parts[0]); DocProperties.put("docbytelength", parts[3]); break; //WT2G, WT10G case 5: DocProperties.put("url", parts[0]); DocProperties.put("ip", parts[1]); DocProperties.put("crawldate", parseDate(parts[2])); DocProperties.put("content-type", parts[3]); DocProperties.put("docbytelength", parts[4]); } } else { int Colon; if ((Colon = lines[i].indexOf(':')) > 1) { /* Content-Type: text/html becomes content-type => text/html contenttype => text/html */ DocProperties.put(lines[i].substring(0, Colon).trim().toLowerCase(), lines[i].substring(Colon + 2).trim()); DocProperties.put(lines[i].substring(0, Colon).trim().toLowerCase().replaceAll("-", ""), lines[i].substring(Colon + 2).trim()); } } } //for String cType = DocProperties.get("contenttype"); if (cType == null) return; cType = cType.toLowerCase(); if (cType.contains("charset")) { final Matcher m = CHARSET_PATTERN.matcher(cType); if (m.find() && m.groupCount() > 0) { try { String charset = m.group(1); charset = StringTools.normaliseEncoding(charset); if (CharEncoding.isSupported(charset)) DocProperties.put("charset", charset); } catch (IllegalStateException ise) { /* this shouldnt happen if m.groupCount > 0, but it does */ } } } }
From source file:org.terrier.indexing.WARC018Collection.java
/** Move the collection to the start of the next document. */ public boolean nextDocument() { DocProperties = new HashMap<String, String>(15); try {/* ww w . ja v a 2 s . c o m*/ warcrecord: while (true) { String line = readLine(); //logger.debug("Checking "+line + " for the magic warc header"); //look for a warc line if (line.startsWith("WARC/0.18")) { //logger.debug("Found warc header"); int headerSize = parseHeaders(true); //logger.debug("Parsed WARC headers in "+ headerSize + " bytes"); final long warc_response_length = Long.parseLong(DocProperties.get("content-length")); //logger.debug("length of http message is "+warc_response_length); if (!DocProperties.get("warc-type").equals("response")) { is.skip(warc_response_length); //-49 continue warcrecord; } headerSize = parseHeaders(false); //logger.debug("Parsed HTTP headers in "+ headerSize + " bytes"); DocProperties.put("docno", DocProperties.get(warc_docno_header)); DocProperties.put("url", DocProperties.get(warc_url_header)); DocProperties.put("crawldate", parseDate(DocProperties.get(warc_crawldate_header))); if (logger.isDebugEnabled()) logger.debug("Now working on document " + DocProperties.get("docno")); DocProperties.put("charset", desiredEncoding); //obtain the character set of the document and put in the charset property String cType = DocProperties.get("content-type"); //force UTF-8 for english documents - webpage isnt clear: //http://boston.lti.cs.cmu.edu/Data/clueweb09/dataset.html#encodings if (cType != null) { cType = cType.toLowerCase(); if (cType.contains("charset")) { final Matcher m = charsetMatchPattern.matcher(cType); if (m.find() && m.groupCount() > 0) { String charset = StringTools.normaliseEncoding(m.group(1)); if (CharEncoding.isSupported(charset)) DocProperties.put("charset", charset); } } } if (forceUTF8) DocProperties.put("charset", "utf-8"); documentsInThisFile++; currentDocumentBlobLength = warc_response_length - headerSize; //-16 return true; } if (eof) { if (documentsInThisFile == 0) { logger.warn(this.getClass().getSimpleName() + " found no documents in " + FilesToProcess.get(FileNumber - 1) + ". " + "Perhaps trec.collection.class is wrongly set or decompression failed."); } if (!openNextFile()) return false; } } } catch (IOException ioe) { logger.error("IOException while reading WARC format collection file" + ioe); } return false; }
From source file:org.terrier.indexing.WARC10Collection.java
/** Move the collection to the start of the next document. */ public boolean nextDocument() { DocProperties = new HashMap<String, String>(15); try {/*from w w w . ja v a2 s . co m*/ warcrecord: while (true) { String line = readLine(); //logger.debug("Checking "+line + " for the magic warc header, found = " + line.startsWith("WARC/1.0")); //look for a warc line if (line.startsWith("WARC/1.0")) { //logger.debug("Found warc header"); int headerSize = parseHeaders(true); //logger.debug("Parsed WARC headers in "+ headerSize + " bytes"); final long warc_response_length = Long.parseLong(DocProperties.get("content-length")); //logger.debug("length of following message is "+warc_response_length); if (!DocProperties.get("warc-type").equals("response")) { //System.err.println("Skipping warc-type of " + DocProperties.get("warc-type")); is.skip(warc_response_length); continue warcrecord; } headerSize = 0; //ignore all content before the HTTP line, e.g. clueweb12-1216wb-96-28178 String statusLine = readLine(); headerSize += readLineByteCount; while (!statusLine.startsWith("HTTP/")) { statusLine = readLine(); headerSize += readLineByteCount; } final String[] statusParts = statusLine.split(" ", 3); int code; try { code = Integer.parseInt(statusParts[1]); } catch (NumberFormatException nfe) { throw new IOException("Invalid status line '" + statusLine + "' for " + DocProperties.get(warc_docno_header)); } headerSize += parseHeaders(false); if (code == 301 || code == 302) { processRedirect(DocProperties.get(warc_url_header), DocProperties.get("location")); //there is no need to skip, as there should only be headers //is.skip(warc_response_length - headerSize); continue warcrecord; } else if (code != 200) { assert false : "Unsupported status code: " + code; } //logger.debug("Parsed HTTP headers in "+ headerSize + " bytes"); DocProperties.put("docno", DocProperties.get(warc_docno_header)); DocProperties.put("url", DocProperties.get(warc_url_header)); DocProperties.put("crawldate", parseDate(DocProperties.get(warc_crawldate_header))); if (logger.isDebugEnabled()) logger.debug("Now working on document " + DocProperties.get("docno")); DocProperties.put("charset", desiredEncoding); //obtain the character set of the document and put in the charset property String cType = DocProperties.get("content-type"); if (cType != null) { cType = cType.toLowerCase(); if (cType.contains("charset")) { final Matcher m = charsetMatchPattern.matcher(cType); if (m.find() && m.groupCount() > 0) { String charset = StringTools.normaliseEncoding(m.group(1)); if (CharEncoding.isSupported(charset)) DocProperties.put("charset", charset); } } } //force UTF-8 for english documents - webpage isnt clear: //http://boston.lti.cs.cmu.edu/Data/clueweb09/dataset.html#encodings if (forceUTF8) DocProperties.put("charset", "utf-8"); documentsInThisFile++; currentDocumentBlobLength = Math.max(0, warc_response_length - headerSize); //document clueweb12-0000tw-00-00010 has no content, causes a negative currentDocumentBlobLength //assert currentDocumentBlobLength > 0 : // "document "+DocProperties.get(warc_docno_header)+" must have size: " // +"warc_response_length - headerSize - 26 = " // + warc_response_length + " - " + headerSize + " -26 " + " = " + currentDocumentBlobLength; return true; } if (eof) { if (documentsInThisFile == 0) { String sourcemsg = FilesToProcess.size() > 0 ? FilesToProcess.get(FileNumber - 1) : "input stream"; logger.warn(this.getClass().getSimpleName() + " found no documents in " + sourcemsg + ". " + "Perhaps trec.collection.class is wrongly set or decompression failed."); } if (!openNextFile()) return false; } } } catch (IOException ioe) { logger.error("IOException while reading WARC format collection file" + ioe); } return false; }