Java HTML Parse Jsoup getDoc(String path)

Here you can find the source of getDoc(String path)

Description

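Reads the HTML file at the given path as UTF-8, replaces every &lt;br&gt; tag and newline with the LINE_START_SUB marker, and parses the result into a Jsoup Document.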

License

Apache License

Declaration

public static final Document getDoc(String path) throws IOException 

Method Source Code


//package com.java2s;
//License from project: Apache License 

import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

/**
 * Loads an HTML file from disk and parses it with Jsoup, after replacing
 * line breaks with a textual marker.
 */
public class Main {
    /**
     * Marker text substituted for every {@code <br>} tag and every newline
     * in the file content before it is handed to Jsoup.
     */
    public static final String LINE_START = "LINE_START_SUB";

    /**
     * Matches a {@code <br>} tag in any letter case, with or without
     * attributes or a self-closing slash. The lookahead requires whitespace,
     * {@code /} or {@code >} right after "br" so that other tags whose names
     * merely start with "br" (e.g. {@code <brand>}) are left alone.
     */
    private static final Pattern BR_TAG =
            Pattern.compile("<br(?=[\\s/>])[^>]*>", Pattern.CASE_INSENSITIVE);

    /**
     * Matches a line terminator: {@code \n}, optionally preceded by
     * {@code \r}, so that CRLF files do not leave a stray carriage return
     * in front of the marker.
     */
    private static final Pattern NEWLINE = Pattern.compile("\\r?\\n");

    /**
     * Reads the file at {@code path} as UTF-8, replaces every {@code <br>}
     * tag and every newline with {@link #LINE_START}, and parses the result
     * as HTML.
     *
     * @param path location of the HTML file to read
     * @return the Jsoup document parsed from the rewritten content
     * @throws IOException if the file cannot be read
     */
    public static final Document getDoc(String path) throws IOException {
        String raw = readFile(path, StandardCharsets.UTF_8);

        // LINE_START contains no '$' or '\', so it is safe to use directly
        // as a regex replacement string.
        String brMarked = BR_TAG.matcher(raw).replaceAll(LINE_START);
        String flattened = NEWLINE.matcher(brMarked).replaceAll(LINE_START);

        return Jsoup.parse(flattened);
    }

    /**
     * Reads the whole file into memory and decodes it with the given charset.
     *
     * @param path     location of the file to read
     * @param encoding charset used to decode the file's bytes
     * @return the decoded file content
     * @throws IOException if the file cannot be read
     */
    private static String readFile(String path, Charset encoding) throws IOException {
        byte[] encoded = Files.readAllBytes(Paths.get(path));
        return new String(encoded, encoding);
    }
}

Related

  1. fixHtml(String htmlContent, String outputFile, String contentFile)
  2. getContentFromHTML(String html)
  3. getDistinctImageUrls(String htmlContent)
  4. getDoc(Connection conn)
  5. getDoc(File file)
  6. getDoc(String url)
  7. getDoctypeName(InputStream s)
  8. getErrorMessage(String htmlStr)
  9. getExplanation(String html)