Java HTML Parse Jsoup parse(InputStream input, String documentIRI, String encoding)

Here you can find the source of parse(InputStream input, String documentIRI, String encoding)

Description

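Parses an HTML input stream into a Jsoup Document. When no encoding is given, the encoding is taken from a leading XML declaration, if one is present.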

License

Apache License

Declaration

public static Document parse(InputStream input, String documentIRI, String encoding) throws IOException 

Method Source Code


//package com.java2s;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.SequenceInputStream;
import java.util.Arrays;

public class Main {
    /** Initial size of the buffer that holds the bytes consumed while sniffing the prolog. */
    private static final int INITIAL_PROLOG_CAPACITY = 256;

    /**
     * Shortest prolog that is inspected for an encoding. Presumably chosen because
     * {@code <?xml encoding="x"?>} (20 bytes) is the shortest declaration that can carry one.
     */
    private static final int MIN_DECLARATION_LENGTH = 20;

    /**
     * Parses an HTML document from a byte stream using Jsoup's HTML parser.
     *
     * <p>When {@code encoding} is {@code null}, the start of the stream is inspected for an XML
     * declaration ({@code <?xml ... encoding="..."?>}); if one is found and names a charset this
     * JVM supports, that charset is used for parsing (workaround for Jsoup issue #1009).
     * Otherwise {@code null} is handed to Jsoup. Leading whitespace bytes are consumed during
     * this inspection and are not replayed to the parser; every other inspected byte is replayed.
     *
     * @param input       stream to read the document from
     * @param documentIRI base IRI of the document; {@code null} is treated as the empty string
     * @param encoding    charset name to decode the stream with, or {@code null} to sniff it
     * @return the parsed document
     * @throws IOException if reading from {@code input} fails
     */
    public static Document parse(InputStream input, String documentIRI, String encoding) throws IOException {
        //Jsoup doesn't allow null document URIs
        if (documentIRI == null) {
            documentIRI = "";
        }

        //workaround for Jsoup issue #1009
        if (encoding == null) {
            // Skip leading whitespace so that a declaration preceded by blanks is still recognised.
            int c;
            do {
                c = input.read();
            } while (c != -1 && Character.isWhitespace(c));

            if (c != -1) {
                byte[] prolog = readProlog(input, c);
                encoding = declaredEncoding(prolog, documentIRI);
                // Replay the consumed bytes in front of the rest of the stream.
                input = new SequenceInputStream(new ByteArrayInputStream(prolog), input);
            }
        }

        //Use Parser.htmlParser() to parse javascript correctly
        return Jsoup.parse(input, encoding, documentIRI, Parser.htmlParser());
    }

    /**
     * Consumes the bytes needed to decide whether the stream starts with a {@code <? ... >}
     * construct and returns exactly the bytes consumed, starting with {@code first}.
     * Reading stops at the first byte that rules such a construct out, at the first
     * {@code '>'} after {@code <?}, or at end of stream.
     */
    private static byte[] readProlog(InputStream input, int first) throws IOException {
        byte[] bytes = new byte[INITIAL_PROLOG_CAPACITY];
        int length = 0;
        bytes[length++] = (byte) first;

        if (first != '<') {
            return Arrays.copyOf(bytes, length);
        }

        int c = input.read();
        if (c == -1) {
            return Arrays.copyOf(bytes, length);
        }
        bytes[length++] = (byte) c;
        if (c != '?') {
            return Arrays.copyOf(bytes, length);
        }

        while ((c = input.read()) != -1) {
            if (length == bytes.length) {
                bytes = Arrays.copyOf(bytes, length * 2);
            }
            bytes[length++] = (byte) c;
            if (c == '>') {
                break;
            }
        }
        return Arrays.copyOf(bytes, length);
    }

    /**
     * Extracts the {@code encoding} pseudo-attribute from a prolog produced by
     * {@link #readProlog}, or returns {@code null} when the prolog is not a complete
     * {@code <?xml ...?>} declaration, has no encoding, or names an unsupported charset.
     */
    private static String declaredEncoding(byte[] prolog, String documentIRI) {
        int length = prolog.length;
        // A prolog this long can only come from the "<?" branch of readProlog, so it is enough
        // to check that it was terminated by "?>" rather than by end of stream.
        if (length < MIN_DECLARATION_LENGTH || prolog[length - 1] != '>' || prolog[length - 2] != '?') {
            return null;
        }

        // Turn "<?xml a="b"?>" into "<xml a="b">" so the XML parser exposes the attributes.
        // ISO-8859-1 maps each byte to one char, so decoding never depends on the platform
        // default charset and never fails.
        String body = new String(prolog, 2, length - 4, java.nio.charset.StandardCharsets.ISO_8859_1);
        Document doc = Jsoup.parse("<" + body + ">", documentIRI, Parser.xmlParser());

        for (org.jsoup.nodes.Element el : doc.children()) {
            if ("xml".equalsIgnoreCase(el.tagName())) {
                String enc = el.attr("encoding").trim();
                if (!enc.isEmpty()) {
                    // An unknown name is dropped so the caller falls back to Jsoup's own
                    // detection instead of failing on an explicitly supplied bad charset.
                    return isSupportedCharset(enc) ? enc : null;
                }
            }
        }
        return null;
    }

    /** Returns whether {@code name} is a legal charset name that this JVM can decode. */
    private static boolean isSupportedCharset(String name) {
        try {
            return java.nio.charset.Charset.isSupported(name);
        } catch (java.nio.charset.IllegalCharsetNameException e) {
            return false;
        }
    }
}

Related

  1. getPlainTextFromHtml(String html)
  2. getTitle(String htmlContent)
  3. htmlArray2textArray(List htmlArray)
  4. isHTMLEmpty(String textToCheck)
  5. parse(final String html)
  6. parse(String html)
  7. parse(String html)
  8. parse(String html)
  9. parse(URL url, int timeout)