List of usage examples for org.jsoup.nodes Document outputSettings
OutputSettings outputSettings
To view the source code for org.jsoup.nodes Document outputSettings, click the Source Link.
From source file:com.switchfly.inputvalidation.sanitizer.StripHtmlSanitizer.java
/**
 * Reduces {@code content} to its plain text.
 *
 * <p>Blank input (as judged by {@code StringUtils.isBlank}) is returned untouched. Otherwise
 * the content is parsed as HTML, every {@code script}, {@code link}, {@code iframe} and
 * {@code style} element is dropped from the tree, and the text of what remains is returned.
 *
 * @param content raw, possibly HTML-bearing input
 * @return {@code content} itself when blank, otherwise the text of the parsed document
 */
@Override
public String execute(String content) {
    if (StringUtils.isBlank(content)) {
        return content;
    }

    Document parsed = Jsoup.parse(content);
    // Kept from the original. NOTE(review): probably has no effect on text() output - confirm.
    parsed.outputSettings().escapeMode(Entities.EscapeMode.xhtml);

    // Drop elements whose bodies must not leak into the extracted text.
    parsed.select("script,link,iframe,style").remove();

    return parsed.text();
}
From source file:com.betel.flowers.pdf.util.XMLtoHtml.java
/**
 * Normalises an HTML string by parsing it with jsoup, serialising it with XML syntax and
 * then un-escaping HTML 4 entities in the serialised markup.
 *
 * @param htmlString the HTML to normalise
 * @return the re-serialised markup with HTML 4 entities un-escaped
 * @throws IOException never thrown by this implementation; the clause is retained so that
 *         existing callers that catch or declare it keep compiling
 */
public String checkHTML(String htmlString) throws IOException {
    Document docHtml = Jsoup.parse(htmlString);
    docHtml.outputSettings().syntax(Document.OutputSettings.Syntax.xml);
    // html() already returns a fresh String, so no defensive copy is needed, and a
    // catch-and-rethrow around these calls added nothing.
    return StringEscapeUtils.unescapeHtml4(docHtml.html());
}
From source file:com.maxl.java.aips2xml.Aips2Xml.java
static String[] extractHtmlSection(MedicalInformations.MedicalInformation m) { // Extract section titles and section ids MedicalInformations.MedicalInformation.Sections med_sections = m.getSections(); List<MedicalInformations.MedicalInformation.Sections.Section> med_section_list = med_sections.getSection(); Document doc = Jsoup.parse(m.getContent()); doc.outputSettings().escapeMode(EscapeMode.xhtml); // Clean html code HtmlUtils html_utils = new HtmlUtils(m.getContent()); html_utils.clean();//from w w w. j a v a 2s . c om // Extract registration number (swissmedic no5) String regnr_str = ""; if (DB_LANGUAGE.equals("de")) regnr_str = html_utils.extractRegNrDE(m.getTitle()); else if (DB_LANGUAGE.equals("fr")) regnr_str = html_utils.extractRegNrFR(m.getTitle()); // Sanitize html String html_sanitized = ""; // First check for bad boys (version=1! but actually version>1!) if (!m.getVersion().equals("1") || m.getContent().substring(0, 20).contains("xml")) { for (int i = 1; i < 22; ++i) { html_sanitized += html_utils.sanitizeSection(i, m.getTitle(), DB_LANGUAGE); } html_sanitized = "<div id=\"monographie\">" + html_sanitized + "</div>"; } else { html_sanitized = m.getContent(); } // Update "Packungen" section and extract therapeutisches index List<String> mTyIndex_list = new ArrayList<String>(); String mContent_str = updateSectionPackungen(m.getTitle(), package_info, regnr_str, html_sanitized, mTyIndex_list); // Add meta-tag and link mContent_str = mContent_str.replaceAll("<head>", "<head>" + "<link href=\"amiko_stylesheet.css\" rel=\"stylesheet\" type=\"text/css\"></>" + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">"); m.setContent(mContent_str); // Fix problem with wrong div class in original Swissmedic file if (DB_LANGUAGE.equals("de")) { m.setStyle(m.getStyle().replaceAll("untertitel", "untertitle")); m.setStyle(m.getStyle().replaceAll("untertitel1", "untertitle1")); } // Correct formatting error introduced by Swissmedic 
m.setAuthHolder(m.getAuthHolder().replaceAll("&", "&")); // Extracts only *first* registration number /* List<String> swissmedicno5_list = Arrays.asList(regnr_str.split("\\s*,\\s*")); String[] swno5_content_map = {swissmedicno5_list.get(0), mContent_str}; */ // Extract *all* registration numbers String[] swno5_content_map = { regnr_str, mContent_str }; return swno5_content_map; //mContent_str; }
From source file:com.maxl.java.aips2xml.Aips2Xml.java
static String addHeaderToXml(String xml_str) { Document mDoc = Jsoup.parse("<kompendium>\n" + xml_str + "</kompendium>"); mDoc.outputSettings().escapeMode(EscapeMode.xhtml); mDoc.outputSettings().prettyPrint(true); mDoc.outputSettings().indentAmount(4); // Add date/*from ww w . j av a2s . com*/ Date df = new Date(); String date_str = df.toString(); mDoc.select("kompendium").first().prependElement("date"); mDoc.select("date").first().text(date_str); // Add language mDoc.select("date").after("<lang></lang>"); if (DB_LANGUAGE.equals("de")) mDoc.select("lang").first().text("DE"); else if (DB_LANGUAGE.equals("fr")) mDoc.select("lang").first().text("FR"); // Fool jsoup.parse which seems to have its own "life" mDoc.select("tbody").unwrap(); Elements img_elems = mDoc.select("img"); for (Element img_e : img_elems) { if (!img_e.hasAttr("src")) img_e.unwrap(); } mDoc.select("img").tagName("image"); String final_xml_str = mDoc.select("kompendium").first().outerHtml(); return final_xml_str; }
From source file:com.maxl.java.aips2xml.Aips2Xml.java
static String convertHtmlToXml(String med_title, String html_str, String regnr_str) { Document mDoc = Jsoup.parse(html_str); mDoc.outputSettings().escapeMode(EscapeMode.xhtml); mDoc.outputSettings().prettyPrint(true); mDoc.outputSettings().indentAmount(4); // <div id="monographie"> -> <fi> mDoc.select("div[id=monographie]").tagName("fi").removeAttr("id"); // <div class="MonTitle"> -> <title> mDoc.select("div[class=MonTitle]").tagName("title").removeAttr("class").removeAttr("id"); // Beautify the title to the best of my possibilities ... still not good enough! String title_str = mDoc.select("title").text().trim().replaceAll("<br />", "").replaceAll("(\\t|\\r?\\n)+", "");// w w w.j a v a 2 s . c o m if (!title_str.equals(med_title)) if (SHOW_ERRORS) System.err.println(med_title + " differs from " + title_str); // Fallback solution: use title from the header AIPS.xml file - the titles look all pretty good! mDoc.select("title").first().text(med_title); // <div class="ownerCompany"> -> <owner> Element owner_elem = mDoc.select("div[class=ownerCompany]").first(); if (owner_elem != null) { owner_elem.tagName("owner").removeAttr("class"); String owner_str = mDoc.select("owner").text(); mDoc.select("owner").first().text(owner_str); } else { mDoc.select("title").after("<owner></owner>"); if (DB_LANGUAGE.equals("de")) mDoc.select("owner").first().text("k.A."); else if (DB_LANGUAGE.equals("fr")) mDoc.select("owner").first().text("n.s."); } // <div class="paragraph"> -> <paragraph> mDoc.select("div[class=paragraph]").tagName("paragraph").removeAttr("class").removeAttr("id"); // <div class="absTitle"> -> <paragraphTitle> mDoc.select("div[class=absTitle]").tagName("paragraphtitle").removeAttr("class"); // <div class="untertitle1"> -> <paragraphSubTitle> mDoc.select("div[class=untertitle1]").tagName("paragraphsubtitle").removeAttr("class"); // <div class="untertitle"> -> <paragraphSubTitle> mDoc.select("div[class=untertitle]").tagName("paragraphsubtitle").removeAttr("class"); // 
<div class="shortCharacteristic"> -> <characteristic> mDoc.select("div[class=shortCharacteristic]").tagName("characteristic").removeAttr("class"); // <div class="image"> mDoc.select("div[class=image]").tagName("image").removeAttr("class"); // <p class="spacing1"> -> <p> / <p class="noSpacing"> -> <p> mDoc.select("p[class]").tagName("p").removeAttr("class"); // <span style="font-style:italic"> -> <i> mDoc.select("span").tagName("i").removeAttr("style"); // <i class="indention1"> -> <i> / <i class="indention2"> -> <b-i> mDoc.select("i[class=indention1]").tagName("i").removeAttr("class"); mDoc.select("i[class=indention2]").tagName("i").removeAttr("class"); // mDoc.select("p").select("i").tagName("i"); // mDoc.select("paragraphtitle").select("i").tagName("para-i"); // mDoc.select("paragraphsubtitle").select("i").tagName("parasub-i"); Elements elems = mDoc.select("paragraphtitle"); for (Element e : elems) { if (!e.text().isEmpty()) e.text(e.text()); } elems = mDoc.select("paragraphsubtitle"); for (Element e : elems) { if (!e.text().isEmpty()) e.text(e.text()); } // Here we take care of tables // <table class="s21"> -> <table> mDoc.select("table[class]").removeAttr("class"); mDoc.select("table").removeAttr("cellspacing").removeAttr("cellpadding").removeAttr("border"); mDoc.select("colgroup").remove(); mDoc.select("td").removeAttr("class").removeAttr("colspan").removeAttr("rowspan"); mDoc.select("tr").removeAttr("class"); elems = mDoc.select("div[class]"); for (Element e : elems) { if (e.text().isEmpty()) e.remove(); } mDoc.select("tbody").unwrap(); // Remove nested table (a nasty table-in-a-table Elements nested_table = mDoc.select("table").select("tr").select("td").select("table"); if (!nested_table.isEmpty()) { nested_table.select("table").unwrap(); } // Here we take care of the images mDoc.select("img").removeAttr("style").removeAttr("align").removeAttr("border"); // Subs and sups mDoc.select("sub[class]").tagName("sub").removeAttr("class"); 
mDoc.select("sup[class]").tagName("sup").removeAttr("class"); mDoc.select("td").select("sub").tagName("td-sub"); mDoc.select("td").select("sup").tagName("td-sup"); // Remove floating <td-sup> tags mDoc.select("p").select("td-sup").tagName("sup"); mDoc.select("p").select("td-sub").tagName("sub"); // Box mDoc.select("div[class=box]").tagName("box").removeAttr("class"); // Insert swissmedicno5 after <owner> tag mDoc.select("owner").after("<swissmedicno5></swissmedicno5"); mDoc.select("swissmedicno5").first().text(regnr_str); // Remove html, head and body tags String xml_str = mDoc.select("body").first().html(); //xml_str = xml_str.replaceAll("<tbody>", "").replaceAll("</tbody>", ""); xml_str = xml_str.replaceAll("<sup> </sup>", ""); xml_str = xml_str.replaceAll("<sub> </sub>", ""); xml_str = xml_str.replaceAll("<p> <i>", "<p><i>"); xml_str = xml_str.replaceAll("</p> </td>", "</p></td>"); xml_str = xml_str.replaceAll("<p> </p>", "<p></p>"); // MUST be improved, the space is not a real space!! xml_str = xml_str.replaceAll("", "- "); xml_str = xml_str.replaceAll("<br />", ""); xml_str = xml_str.replaceAll("(?m)^[ \t]*\r?\n", ""); // Remove multiple instances of <p></p> Scanner scanner = new Scanner(xml_str); String new_xml_str = ""; int counter = 0; while (scanner.hasNextLine()) { String line = scanner.nextLine(); if (line.trim().equals("<p></p>")) { counter++; } else counter = 0; if (counter < 3) new_xml_str += line; } scanner.close(); return new_xml_str; }
From source file:me.rkfg.xmpp.bot.plugins.CoolStoryPlugin.java
private String fetchStory(Website website) throws IOException { int roll = 0; String result;//from ww w . j a v a 2s. c o m int resultLength; int resultLines; //noinspection ConstantConditions do { roll++; final Document doc = Jsoup.connect(website.getUrlString()).userAgent(DEFAULT_UA).get(); doc.outputSettings(new Document.OutputSettings().prettyPrint(false)); logger.info("Fetched a story from {}", doc.location()); final Element story = doc.select(website.getCssQuery()).first(); if (story == null) { return ERROR_COULD_NOT_PARSE; } story.select("div").remove(); story.select("img").forEach(img -> img.replaceWith(new TextNode(img.attr("src"), ""))); story.select("br").after("\\n"); story.select("p").before("\\n\\n"); final String storyHtml = story.html().replaceAll("\\\\n", "\n"); result = Jsoup.clean(storyHtml, "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false)) .trim(); resultLength = result.length(); resultLines = countLines(result); } while (CONFIG_REROLL_LONG_STORIES && (resultLength > CONFIG_MAX_STORY_LENGTH || resultLines > CONFIG_MAX_STORY_LINES) && roll <= CONFIG_MAX_ROLLS); return result; }
From source file:ac.simons.oembed.Oembed.java
/**
 * Runs {@code transformDocument} on the given HTML and returns the inner HTML of the
 * resulting body, serialised without pretty-printing and with the xhtml escape mode.
 *
 * @param documentHtml the HTML to transform
 * @return the inner HTML of the transformed document's body
 */
public String transformDocumentString(final String documentHtml) {
    final Document transformed = transformDocument(documentHtml);

    // Configure serialisation step by step on the document's own settings object
    final Document.OutputSettings settings = transformed.outputSettings();
    settings.prettyPrint(false);
    settings.escapeMode(EscapeMode.xhtml);

    return transformed.body().html();
}
From source file:com.isomorphic.maven.packaging.Downloads.java
/**
 * Interrogates the remote server for a list of hyperlinks matching the given distribution's
 * {@link Distribution#getRemoteIndexFilter() filter}.
 *
 * <p>A warning that includes the server's response is logged when no link matches.
 *
 * @param dist the build in which some files should exist
 * @return a String array of html href attributes
 * @throws MojoExecutionException if the request fails or the response cannot be processed
 */
private String[] list(Distribution dist) throws MojoExecutionException {
    final HttpGet indexRequest = new HttpGet(dist.getRemoteIndex());

    final HttpResponse indexResponse;
    try {
        LOGGER.debug("Requesting list of files from {}{}", DOMAIN, dist.getRemoteIndex());
        indexResponse = httpClient.execute(host, indexRequest);
    } catch (Exception e) {
        throw new MojoExecutionException("Error issuing GET request for bundle at '" + indexRequest + "'", e);
    }

    final Document page;
    try {
        page = Jsoup.parse(EntityUtils.toString(indexResponse.getEntity()));
        page.outputSettings().prettyPrint(true);
    } catch (Exception e) {
        throw new MojoExecutionException("Error processing response from '" + indexRequest + "'", e);
    }

    // Collect the href of every element selected by the distribution's filter
    final List<String> hrefs = new ArrayList<String>();
    for (Element link : page.select(dist.getRemoteIndexFilter())) {
        hrefs.add(link.attr("href"));
    }

    if (hrefs.isEmpty()) {
        String warning = String.format("No downloads found at '%s%s'.  Response from server: \n\n%s\n", DOMAIN,
                dist.getRemoteIndex(), page.html());
        LOGGER.warn(warning);
    }

    return hrefs.toArray(new String[0]);
}
From source file:com.maxl.java.aips2sqlite.PseudoExpertInfo.java
/** * Extracts all the important information from the pseudo "Fachinfo" file * @param pseudo_info_file/*from w ww. j a va 2 s. c o m*/ */ public boolean extractInfo(int idx, FileInputStream pseudo_info_file) { mMedi = new MedicalInformations.MedicalInformation(); mSectionContent = new ArrayList<String>(); mSectionTitles = new ArrayList<String>(); mBarCodes = new ArrayList<String>(); m_list_of_packages = new ArrayList<String>(); String mediTitle = ""; String mediAuthor = ""; String mediPseudoTag = ""; String mediHtmlContent = ""; StringBuilder content = new StringBuilder(); try { // Read in docx file XWPFDocument docx = new XWPFDocument(pseudo_info_file); // Get iterator through all paragraphs Iterator<XWPFParagraph> para = docx.getParagraphsIterator(); // Pre-process input stream to extract paragraph titles boolean goodToGo = false; while (para.hasNext()) { List<XWPFRun> runs = para.next().getRuns(); if (!runs.isEmpty()) { for (XWPFRun r : runs) { // bold and italics identifies section title! 
if (r.isBold()) { // && r.isItalic()) { String pText = r.getParagraph().getText(); // These are the first chapter titles (DE and FR) if (pText.equals("Zusammensetzung") || pText.equals("Composition")) goodToGo = true; if (goodToGo == true) mSectionTitles.add(pText); } } } } // Add "nil" at the end mSectionTitles.add("nil"); if (mLanguage.equals("de") && !mSectionTitles.get(0).equals("Zusammensetzung")) return false; if (mLanguage.equals("fr") && !mSectionTitles.get(0).equals("Composition")) return false; // Reset iterator para = docx.getParagraphsIterator(); // Init list for section content for (int i = 0; i < mSectionTitles.size(); ++i) mSectionContent.add(i, ""); // Get title if (para.hasNext()) mediTitle = para.next().getParagraphText(); // Get author while using "Medizinprodukt" as tag String prevParaText = ""; while (para.hasNext()) { String paraText = para.next().getParagraphText(); // If this word is not found, then no pseudo FI will be produced if (paraText.equals("Medizinprodukt") || paraText.equals("Dispositif mdical")) { mediPseudoTag = paraText; mediAuthor = prevParaText; break; } prevParaText = paraText; } // Get section titles + sections + ean codes boolean isSectionPackungen = false; int numSection = 0; // Init with section1 and title String sectionId_str = ""; String sectionTitle_str = ""; mEanCodes_str = ""; mSectionIds_str = "section1,"; mSectionTitles_str = mediTitle + ","; m_pack_info_str = ""; // This is the EAN code pattern Pattern pattern = Pattern.compile("^[0-9]{13}"); // Loop through it, identifying medication title, author, section titles and corresponding titles while (para.hasNext()) { String paraText = para.next().getParagraphText(); if (paraText.equals(mSectionTitles.get(numSection))) { // ->> Get section title isSectionPackungen = false; // Get section title if (numSection < mSectionTitles.size()) numSection++; // Section "Packungen" is special if (paraText.equals("Packungen") || paraText.equals("Prsentation")) { isSectionPackungen = 
true; } // Close previous div if (numSection > 1) content.append("</div>"); // Create html sectionId_str = "section" + (numSection + 1); // section1 is reserved for the MonTitle sectionTitle_str = mSectionTitles.get(numSection - 1); content.append("<div class=\"paragraph\" id=\"" + sectionId_str + "\">"); content.append("<div class=\"absTitle\">" + sectionTitle_str + "</div>"); // Generate section id string mSectionIds_str += (sectionId_str + ","); // Generate titles string mSectionTitles_str += (sectionTitle_str + ";"); } else { // ->> Get section content String s = mSectionContent.get(numSection - 1); mSectionContent.set(numSection - 1, s + paraText + " "); // Create html content.append("<p class=\"spacing1\">" + paraText + "</p>"); // Extract EAN codes and start positions Matcher matcher = pattern.matcher(paraText); while (matcher.find()) { String eanCode = matcher.group(); mEanCodes_str += (eanCode + ", "); if (!eanCode.isEmpty()) { String pup = ""; String efp = ""; String fep = ""; String fap = ""; String vat = ""; String size = ""; String units = ""; String swissmedic_cat = ""; String pharma_code = ""; int visible = 0xff; int has_free_samples = 0x00; // by default no free samples // Exctract fep and fap pricing information // FAP = Fabrikabgabepreis = EFP? 
// FEP = Fachhandelseinkaufspreis // EFP = FAP < FEP < PUP if (m_map_products != null && eanCode != null && m_map_products.containsKey(eanCode)) { Product product = m_map_products.get(eanCode); if (product.efp > 0.0f) efp = String.format("CHF %.2f", product.efp); if (product.pp > 0.0f) pup = String.format("CHF %.2f", product.pp); if (product.fap > 0.0f) fap = String.format("CHF %.2f", product.fap); if (product.fep > 0.0f) fep = String.format("CHF %.2f", product.fep); if (product.vat > 0.0f) vat = String.format("%.2f", product.vat); if (product.size != null && !product.size.isEmpty()) size = product.size; if (product.units != null && product.units.length > 0) units = product.units[0]; if (product.swissmedic_cat != null && !product.swissmedic_cat.isEmpty()) swissmedic_cat = product.swissmedic_cat; if (product.pharmacode != null && !product.pharmacode.isEmpty()) pharma_code = product.pharmacode; visible = product.visible; has_free_samples = product.free_sample; } m_list_of_packages.add(mediTitle.toUpperCase() + ", " + units + ", " + size + "|" + size + "|" + units + "|" + efp + "|" + pup + "|" + fap + "|" + fep + "|" + vat + "|" + swissmedic_cat + ",,|" + eanCode + "|" + pharma_code + "|" + visible + "|" + has_free_samples + "\n"); // Generate bar codes BarCode bc = new BarCode(); String barcodeImg64 = bc.encode(eanCode); mBarCodes.add("<p class=\"spacing1\">" + barcodeImg64 + "</p>"); content.append(barcodeImg64); } } // Generate section Packungen for search result if (isSectionPackungen) m_pack_info_str += (paraText + "\n"); } } /* // Add chapter "Barcodes" content.append("<p class=\"paragraph\"></p><div class=\"absTitle\">" + "Barcodes" + "</div>"); for (String bcode : mBarCodes) content.append(bcode); */ // Remove last comma from mEanCodes_str if (!mEanCodes_str.isEmpty()) mEanCodes_str = mEanCodes_str.substring(0, mEanCodes_str.length() - 2); // Remove last \n from mSectionPackungen_str if (!m_pack_info_str.isEmpty()) m_pack_info_str = 
m_pack_info_str.substring(0, m_pack_info_str.length() - 1); // Set title, autor mMedi.setTitle(mediTitle); mMedi.setAuthHolder(mediAuthor); mMedi.setAtcCode("PSEUDO"); mMedi.setSubstances(mediTitle); System.out.println(idx + " - " + mediTitle + ": " + mEanCodes_str); // Close previous div + monographie div content.append("</div></div>"); String title = "<div class=\"MonTitle\" id=\"section1\">" + mediTitle + "</div>"; String author = "<div class=\"ownerCompany\"><div style=\"text-align: right;\">" + mediAuthor + "</div></div>"; // Set "Medizinprodukt" label String pseudo = "<p class=\"spacing1\">" + mediPseudoTag + "</p>"; // Set medi content mediHtmlContent = "<html><head></head><body><div id=\"monographie\">" + title + author + pseudo + content.toString() + "</div></body></html>"; // Generate clean html file Document doc = Jsoup.parse(mediHtmlContent); doc.outputSettings().escapeMode(EscapeMode.xhtml); doc.outputSettings().charset("UTF-8"); doc.outputSettings().prettyPrint(true); doc.outputSettings().indentAmount(1); mediHtmlContent = doc.html(); // Set html content mMedi.setContent(mediHtmlContent); // Add to DB addToDB(); return true; } catch (IOException e) { e.printStackTrace(); return false; } }
From source file:com.maxl.java.aips2sqlite.RealExpertInfo.java
public void process() { // Get stop words first getStopWords();/*from w w w . j av a 2 s .c o m*/ // Extract EPha SwissmedicNo5 to ATC map extractSwissmedicNo5ToAtcMap(); // Extract package information (this is the heavy-duty bit) extractPackageInfo(); // Extract Swiss DRG information extractSwissDRGInfo(); try { // Load CSS file: used only for self-contained xml files String amiko_style_v1_str = FileOps.readCSSfromFile(Constants.FILE_STYLE_CSS_BASE + "v1.css"); // Create error report file ParseReport parse_errors = null; if (CmlOptions.GENERATE_REPORTS == true) { parse_errors = new ParseReport(Constants.FILE_PARSE_REPORT, CmlOptions.DB_LANGUAGE, "html"); if (CmlOptions.DB_LANGUAGE.equals("de")) parse_errors.addHtmlHeader("Schweizer Arzneimittel-Kompendium", Constants.FI_DB_VERSION); else if (CmlOptions.DB_LANGUAGE.equals("fr")) parse_errors.addHtmlHeader("Compendium des Mdicaments Suisse", Constants.FI_DB_VERSION); } // Create indications report file BufferedWriter bw_indications = null; Map<String, String> tm_indications = new TreeMap<String, String>(); if (CmlOptions.INDICATIONS_REPORT == true) { ParseReport indications_report = new ParseReport(Constants.FILE_INDICATIONS_REPORT, CmlOptions.DB_LANGUAGE, "txt"); bw_indications = indications_report.getBWriter(); } /* * Add pseudo Fachinfos to SQLite database */ int tot_pseudo_counter = 0; if (CmlOptions.ADD_PSEUDO_FI == true) { PseudoExpertInfo pseudo_fi = new PseudoExpertInfo(m_sql_db, CmlOptions.DB_LANGUAGE, m_map_products); // Process tot_pseudo_counter = pseudo_fi.process(); System.out.println(""); } /* * Add real Fachinfos to SQLite database */ // Initialize counters for different languages int med_counter = 0; int tot_med_counter = 0; int missing_regnr_str = 0; int missing_pack_info = 0; int missing_atc_code = 0; int errors = 0; String fi_complete_xml = ""; // First pass is always with DB_LANGUAGE set to German! 
(most complete information) // The file dumped in ./reports is fed to AllDown.java to generate a multilingual ATC code / ATC class file, e.g. German - French Set<String> atccode_set = new TreeSet<String>(); // Treemap for owner error report (sorted by key) TreeMap<String, ArrayList<String>> tm_owner_error = new TreeMap<String, ArrayList<String>>(); HtmlUtils html_utils = null; System.out.println("Processing real Fachinfos..."); for (MedicalInformations.MedicalInformation m : m_med_list) { // --> Read FACHINFOS! <-- if (m.getLang().equals(CmlOptions.DB_LANGUAGE) && m.getType().equals("fi")) { // Database contains less than 5000 medis - this is a safe upperbound! if (tot_med_counter < 5000) { // Trim titles of leading and trailing spaces m.setTitle(m.getTitle().trim()); // Extract section titles and section ids MedicalInformations.MedicalInformation.Sections med_sections = m.getSections(); List<MedicalInformations.MedicalInformation.Sections.Section> med_section_list = med_sections .getSection(); String ids_str = ""; String titles_str = ""; for (MedicalInformations.MedicalInformation.Sections.Section s : med_section_list) { ids_str += (s.getId() + ","); titles_str += (s.getTitle() + ";"); } Document doc = Jsoup.parse(m.getContent()); doc.outputSettings().escapeMode(EscapeMode.xhtml); html_utils = new HtmlUtils(m.getContent()); html_utils.setLanguage(CmlOptions.DB_LANGUAGE); html_utils.clean(); // Extract registration number (swissmedic no5) String regnr_str = ""; if (CmlOptions.DB_LANGUAGE.equals("de")) regnr_str = html_utils.extractRegNrDE(m.getTitle()); else if (CmlOptions.DB_LANGUAGE.equals("fr")) regnr_str = html_utils.extractRegNrFR(m.getTitle()); // Pattern matcher for regnr command line option, (?s) searches across multiple lines Pattern regnr_pattern = Pattern.compile("(?s).*\\b" + CmlOptions.OPT_MED_REGNR); if (m.getTitle().toLowerCase().startsWith(CmlOptions.OPT_MED_TITLE.toLowerCase()) && regnr_pattern.matcher(regnr_str).find() && 
m.getAuthHolder().toLowerCase() .startsWith(CmlOptions.OPT_MED_OWNER.toLowerCase())) { System.out.println(tot_med_counter + " - " + m.getTitle() + ": " + regnr_str); if (regnr_str.isEmpty()) { errors++; if (CmlOptions.GENERATE_REPORTS == true) { parse_errors.append("<p style=\"color:#ff0099\">ERROR " + errors + ": reg. nr. could not be parsed in AIPS.xml (swissmedic) - " + m.getTitle() + " (" + regnr_str + ")</p>"); // Add to owner errors ArrayList<String> error = tm_owner_error.get(m.getAuthHolder()); if (error == null) error = new ArrayList<String>(); error.add(m.getTitle() + ";regnr"); tm_owner_error.put(m.getAuthHolder(), error); } missing_regnr_str++; regnr_str = ""; } // Associate ATC classes and subclasses (atc_map) String atc_class_str = ""; String atc_description_str = ""; // This bit is necessary because the ATC Code in the AIPS DB is broken sometimes String atc_code_str = ""; boolean atc_error_found = false; // Use EPha ATC Codes, AIPS is fallback solution String authNrs = m.getAuthNrs(); if (authNrs != null) { // Deal with multi-swissmedic no5 case String regnrs[] = authNrs.split(","); // Use set to avoid duplicate ATC codes Set<String> regnrs_set = new LinkedHashSet<>(); // Loop through EPha ATC codes for (String r : regnrs) { regnrs_set.add(m_smn5_atc_map.get(r.trim())); } // Iterate through set and format nicely for (String r : regnrs_set) { if (atc_code_str == null || atc_code_str.isEmpty()) atc_code_str = r; else atc_code_str += "," + r; } } else atc_error_found = true; // Notify any other problem with the EPha ATC codes if (atc_code_str == null || atc_code_str.isEmpty()) atc_error_found = true; // Fallback solution if (atc_error_found == true) { if (m.getAtcCode() != null && !m.getAtcCode().equals("n.a.") && m.getAtcCode().length() > 1) { atc_code_str = m.getAtcCode(); atc_code_str = atc_code_str.replaceAll("–", "("); atc_code_str = atc_code_str.replaceAll("Code", "").replaceAll("ATC", "") .replaceAll(" ", "").replaceAll("\\(.*", 
"").replaceAll("/", ",") .replaceAll("[^A-Za-z0-9,]", ""); if (atc_code_str.charAt(1) == 'O') { // E.g. Ascosal Brausetabletten atc_code_str = atc_code_str.substring(0, 1) + '0' + atc_code_str.substring(2); } if (atc_code_str.length() > 7) { if (atc_code_str.charAt(7) != ',' || atc_code_str.length() != 15) atc_code_str = atc_code_str.substring(0, 7); } } else { // Work backwards using m_atc_map and m.getSubstances() String substances = m.getSubstances(); if (substances != null) { if (m_atc_map.containsValue(substances)) { for (Map.Entry<String, String> entry : m_atc_map.entrySet()) { if (entry.getValue().equals(substances)) { atc_code_str = entry.getKey(); } } } } } atc_error_found = false; } // Now let's clean the m.getSubstances() String substances = m.getSubstances(); if ((substances == null || substances.length() < 3) && atc_code_str != null) { substances = m_atc_map.get(atc_code_str); } // Set clean substances m.setSubstances(substances); // Set clean ATC Code m.setAtcCode(atc_code_str); // System.out.println("ATC -> " + atc_code_str + ": " + substances); if (atc_code_str != null) { // \\s -> whitespace character, short for [ \t\n\x0b\r\f] // atc_code_str = atc_code_str.replaceAll("\\s",""); // Take "leave" of the tree (most precise classification) String a = m_atc_map.get(atc_code_str); if (a != null) { atc_description_str = a; atccode_set.add(atc_code_str + ": " + a); } else { // Case: ATC1,ATC2 if (atc_code_str.length() == 15) { String[] codes = atc_code_str.split(","); if (codes.length > 1) { String a1 = m_atc_map.get(codes[0]); if (a1 == null) { atc_error_found = true; a1 = "k.A."; } String a2 = m_atc_map.get(codes[1]); if (a2 == null) { atc_error_found = true; a2 = "k.A."; } atc_description_str = a1 + "," + a2; } } else if (m.getSubstances() != null) { // Fallback in case nothing else works atc_description_str = m.getSubstances(); // Work backwards using m_atc_map and m.getSubstances(), change ATC code if (atc_description_str != null) { if 
(m_atc_map.containsValue(atc_description_str)) { for (Map.Entry<String, String> entry : m_atc_map.entrySet()) { if (entry.getValue().equals(atc_description_str)) { m.setAtcCode(entry.getKey()); } } } } } else { atc_error_found = true; if (CmlOptions.DB_LANGUAGE.equals("de")) atc_description_str = "k.A."; else if (CmlOptions.DB_LANGUAGE.equals("fr")) atc_description_str = "n.s."; } } // Read out only two levels (L1, L3, L4, L5) for (int i = 1; i < 6; i++) { if (i != 2) { String atc_key = ""; if (i <= atc_code_str.length()) atc_key = atc_code_str.substring(0, i); char sep = (i >= 4) ? '#' : ';'; // #-separator between L4 and L5 if (atc_key != null) { String c = m_atc_map.get(atc_key); if (c != null) { atccode_set.add(atc_key + ": " + c); atc_class_str += (c + sep); } else { atc_class_str += sep; } } else { atc_class_str += sep; } } } // System.out.println("atc class = " + atc_class_str); // If DRG medication, add to atc_description_str ArrayList<String> drg = m_swiss_drg_info.get(atc_code_str); if (drg != null) { atc_description_str += (";DRG"); } } if (atc_error_found) { errors++; if (CmlOptions.GENERATE_REPORTS) { parse_errors.append("<p style=\"color:#0000bb\">ERROR " + errors + ": Broken or missing ATC-Code-Tag in AIPS.xml (Swissmedic) or ATC index (Wido) - " + m.getTitle() + " (" + regnr_str + ")</p>"); // Add to owner errors ArrayList<String> error = tm_owner_error.get(m.getAuthHolder()); if (error == null) error = new ArrayList<String>(); error.add(m.getTitle() + ";atccode"); tm_owner_error.put(m.getAuthHolder(), error); } System.err.println(">> ERROR: " + tot_med_counter + " - no ATC-Code found in the XML-Tag \"atcCode\" - (" + regnr_str + ") " + m.getTitle()); missing_atc_code++; } // Additional info stored in add_info_map String add_info_str = ";"; List<String> rnr_list = Arrays.asList(regnr_str.split("\\s*, \\s*")); if (rnr_list.size() > 0) add_info_str = m_add_info_map.get(rnr_list.get(0)); // Sanitize html String html_sanitized = ""; // First check for 
bad boys (version=1! but actually version>1!) if (!m.getVersion().equals("1") || m.getContent().substring(0, 20).contains("xml")) { for (int i = 1; i < 22; ++i) { html_sanitized += html_utils.sanitizeSection(i, m.getTitle(), m.getAuthHolder(), CmlOptions.DB_LANGUAGE); } html_sanitized = "<div id=\"monographie\">" + html_sanitized + "</div>"; } else { html_sanitized = m.getContent(); } // Add author number html_sanitized = html_sanitized.replaceAll("<div id=\"monographie\">", "<div id=\"monographie\" name=\"" + m.getAuthNrs() + "\">"); // Add Footer, timestamp in RFC822 format DateFormat dateFormat = new SimpleDateFormat("EEE', 'dd' 'MMM' 'yyyy' 'HH:mm:ss' 'Z", Locale.getDefault()); Date date = new Date(); String footer_str = "<p class=\"footer\">Auto-generated by <a href=\"https://github.com/zdavatz/aips2sqlite\">aips2sqlite</a> on " + dateFormat.format(date) + "</p>"; // html_sanitized += footer_str; html_sanitized = html_sanitized.replaceAll("</div>$", footer_str + "</div>"); // Extract section indications String section_indications = ""; if (CmlOptions.DB_LANGUAGE.equals("de")) { String sstr1 = "Indikationen/Anwendungsmglichkeiten"; String sstr2 = "Dosierung/Anwendung"; if (html_sanitized.contains(sstr1) && html_sanitized.contains(sstr2)) { int idx1 = html_sanitized.indexOf(sstr1) + sstr1.length(); int idx2 = html_sanitized.substring(idx1, html_sanitized.length()) .indexOf(sstr2); try { section_indications = html_sanitized.substring(idx1, idx1 + idx2); } catch (StringIndexOutOfBoundsException e) { e.printStackTrace(); } } } else if (CmlOptions.DB_LANGUAGE.equals("fr")) { String sstr1 = "Indications/Possibilits demploi"; String sstr2 = "Posologie/Mode demploi"; html_sanitized = html_sanitized.replaceAll("Indications/Possibilits d'emploi", sstr1); html_sanitized = html_sanitized.replaceAll("Posologie/Mode d'emploi", sstr2); html_sanitized = html_sanitized.replaceAll("Indications/possibilits demploi", sstr1); html_sanitized = 
html_sanitized.replaceAll("Posologie/mode demploi", sstr2); if (html_sanitized.contains(sstr1) && html_sanitized.contains(sstr2)) { int idx1 = html_sanitized.indexOf(sstr1) + sstr1.length(); int idx2 = html_sanitized.substring(idx1, html_sanitized.length()) .indexOf(sstr2); try { section_indications = html_sanitized.substring(idx1, idx1 + idx2); } catch (StringIndexOutOfBoundsException e) { e.printStackTrace(); } } } // Remove all p's, div's, span's and sup's section_indications = section_indications.replaceAll("\\<p.*?\\>", "") .replaceAll("</p>", ""); section_indications = section_indications.replaceAll("\\<div.*?\\>", "") .replaceAll("</div>", ""); section_indications = section_indications.replaceAll("\\<span.*?\\>", "") .replaceAll("</span>", ""); section_indications = section_indications.replaceAll("\\<sup.*?\\>", "") .replaceAll("</sup>", ""); // System.out.println(section_indications); if (CmlOptions.DB_LANGUAGE.equals("fr")) { // Remove apostrophes section_indications = section_indications.replaceAll("l'", "") .replaceAll("d'", ""); section_indications = section_indications.replaceAll("l", "").replaceAll("d", ""); } // Remove all URLs section_indications = section_indications.replaceAll( "\\b(http|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]", ""); // Remove list of type a) b) c) ... 1) 2) ... section_indications = section_indications.replaceAll("^\\w\\)", ""); // Remove numbers, commas, semicolons, parentheses, etc. 
section_indications = section_indications.replaceAll("[^A-Za-z\\xC0-\\xFF- ]", ""); // Generate long list of keywords LinkedList<String> wordsAsList = new LinkedList<String>( Arrays.asList(section_indications.split("\\s+"))); // Remove stop words Iterator<String> wordIterator = wordsAsList.iterator(); while (wordIterator.hasNext()) { // Note: This assumes there are no null entries in the list and all stopwords are stored in lower case String word = wordIterator.next().trim().toLowerCase(); if (word.length() < 3 || m.getTitle().toLowerCase().contains(word) || m_stop_words_hash.contains(word)) wordIterator.remove(); } section_indications = ""; for (String w : wordsAsList) { // Remove any leading dash or hyphen if (w.startsWith("-")) w = w.substring(1); section_indications += (w + ";"); if (CmlOptions.INDICATIONS_REPORT == true) { // Add to map (key->value), word = key, value = how many times used // Is word w already stored in treemap? String t_str = tm_indications.get(w); if (t_str == null) { t_str = m.getTitle(); tm_indications.put(w, t_str); } else { t_str += (", " + m.getTitle()); tm_indications.put(w, t_str); } } } /* * Update section "Packungen", generate packungen string for shopping cart, and extract therapeutisches index */ List<String> mTyIndex_list = new ArrayList<String>(); m_list_of_packages.clear(); m_list_of_eancodes.clear(); String mContent_str = updateSectionPackungen(m.getTitle(), m.getAtcCode(), m_package_info, regnr_str, html_sanitized, mTyIndex_list); m.setContent(mContent_str); // Check if mPackSection_str is empty AND command line option PLAIN is not active if (CmlOptions.PLAIN == false && m_pack_info_str.isEmpty()) { errors++; if (CmlOptions.GENERATE_REPORTS) { parse_errors.append("<p style=\"color:#bb0000\">ERROR " + errors + ": SwissmedicNo5 not found in Packungen.xls (Swissmedic) - " + m.getTitle() + " (" + regnr_str + ")</p>"); // Add to owner errors ArrayList<String> error = tm_owner_error.get(m.getAuthHolder()); if (error == null) error 
= new ArrayList<String>(); error.add(m.getTitle() + ";swissmedic5"); tm_owner_error.put(m.getAuthHolder(), error); } System.err.println(">> ERROR: " + tot_med_counter + " - SwissmedicNo5 not found in Swissmedic Packungen.xls - (" + regnr_str + ") " + m.getTitle()); missing_pack_info++; } // Fix problem with wrong div class in original Swissmedic file if (CmlOptions.DB_LANGUAGE.equals("de")) { m.setStyle(m.getStyle().replaceAll("untertitel", "untertitle")); m.setStyle(m.getStyle().replaceAll("untertitel1", "untertitle1")); } // Correct formatting error introduced by Swissmedic m.setAuthHolder(m.getAuthHolder().replaceAll("&", "&")); // Check if substances str has a '$a' and change it to '&alpha' if (m.getSubstances() != null) m.setSubstances(m.getSubstances().replaceAll("\\$a", "α")); if (CmlOptions.XML_FILE == true) { if (!regnr_str.isEmpty()) { // Generate and add hash code String html_str_no_timestamp = mContent_str .replaceAll("<p class=\"footer\">.*?</p>", ""); String hash_code = html_utils.calcHashCode(html_str_no_timestamp); // Add header to html file mContent_str = mContent_str.replaceAll("<head>", "<head>" + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" name=\"fi_" + hash_code + "\"/>" + "<style>" + amiko_style_v1_str + "</style>"); // Note: the following line is not necessary! 
// m.setContent(mContent_str); // Add header to xml file String xml_str = html_utils.convertHtmlToXml("fi", m.getTitle(), mContent_str, regnr_str); xml_str = html_utils.addHeaderToXml("singlefi", xml_str); fi_complete_xml += (xml_str + "\n"); // Write to html and xml files to disk String name = m.getTitle(); // Replace all "Sonderzeichen" name = name.replaceAll("[^a-zA-Z0-9]+", "_"); if (CmlOptions.DB_LANGUAGE.equals("de")) { FileOps.writeToFile(mContent_str, Constants.FI_FILE_XML_BASE + "fi_de_html/", name + "_fi_de.html"); FileOps.writeToFile(xml_str, Constants.FI_FILE_XML_BASE + "fi_de_xml/", name + "_fi_de.xml"); } else if (CmlOptions.DB_LANGUAGE.equals("fr")) { FileOps.writeToFile(mContent_str, Constants.FI_FILE_XML_BASE + "fi_fr_html/", name + "_fi_fr.html"); FileOps.writeToFile(xml_str, Constants.FI_FILE_XML_BASE + "fi_fr_xml/", name + "_fi_fr.xml"); } } } int customer_id = 0; // Is the customer paying? If yes add customer id // str1.toLowerCase().contains(str2.toLowerCase()) if (m.getAuthHolder().toLowerCase().contains("desitin")) customer_id = 1; /* / HERE GO THE OTHER PAYING CUSTOMERS (increment customer_id respectively) */ // Extract (O)riginal / (G)enerika info String orggen_str = ""; if (add_info_str != null) { List<String> ai_list = Arrays.asList(add_info_str.split("\\s*;\\s*")); if (ai_list != null) { if (!ai_list.get(0).isEmpty()) orggen_str = ai_list.get(0); } } // @maxl: 25.04.2015 -> set orggen_str to nil (we are using add_info_str for group names now...) 
orggen_str = ""; /* * Add medis, titles and ids to database */ String packages_str = ""; for (String s : m_list_of_packages) packages_str += s; String eancodes_str = ""; for (String e : m_list_of_eancodes) eancodes_str += (e + ", "); if (!eancodes_str.isEmpty() && eancodes_str.length() > 2) eancodes_str = eancodes_str.substring(0, eancodes_str.length() - 2); m_sql_db.addExpertDB(m, packages_str, regnr_str, ids_str, titles_str, atc_description_str, atc_class_str, m_pack_info_str, orggen_str, customer_id, mTyIndex_list, section_indications); m_sql_db.addProductDB(m, packages_str, eancodes_str, m_pack_info_str); med_counter++; } } tot_med_counter++; } } System.out.println(); System.out.println("--------------------------------------------"); System.out.println("Total number of real Fachinfos: " + m_med_list.size()); System.out.println("Number of FI with package information: " + tot_med_counter); System.out.println("Number of FI in generated database: " + med_counter); System.out.println("Number of errors in db: " + errors); System.out.println("Number of missing reg. nr. 
(min): " + missing_regnr_str); System.out.println("Number of missing pack info: " + missing_pack_info); System.out.println("Number of missing atc codes: " + missing_atc_code); System.out.println("--------------------------------------------"); System.out.println("Total number of pseudo Fachinfos: " + tot_pseudo_counter); System.out.println("--------------------------------------------"); if (CmlOptions.XML_FILE == true) { fi_complete_xml = html_utils.addHeaderToXml("kompendium", fi_complete_xml); // Write kompendium xml file to disk if (CmlOptions.DB_LANGUAGE.equals("de")) { FileOps.writeToFile(fi_complete_xml, Constants.FI_FILE_XML_BASE, "fi_de.xml"); if (CmlOptions.ZIP_BIG_FILES) FileOps.zipToFile(Constants.FI_FILE_XML_BASE, "fi_de.xml"); } else if (CmlOptions.DB_LANGUAGE.equals("fr")) { FileOps.writeToFile(fi_complete_xml, Constants.FI_FILE_XML_BASE, "fi_fr.xml"); if (CmlOptions.ZIP_BIG_FILES) FileOps.zipToFile(Constants.FI_FILE_XML_BASE, "fi_fr.xml"); } // Copy stylesheet file to ./fis/ folders try { File src = new File(Constants.FILE_STYLE_CSS_BASE + "v1.css"); File dst_de = new File(Constants.FI_FILE_XML_BASE + "fi_de_html/"); File dst_fr = new File(Constants.FI_FILE_XML_BASE + "fi_fr_html/"); if (src.exists()) { if (dst_de.exists()) FileUtils.copyFileToDirectory(src, dst_de); if (dst_fr.exists()) FileUtils.copyFileToDirectory(src, dst_fr); } } catch (IOException e) { // TODO: Unhandled! 
} } if (CmlOptions.GENERATE_REPORTS == true) { parse_errors.append("<br/>"); parse_errors .append("<p>Number of medications with package information: " + tot_med_counter + "</p>"); parse_errors.append("<p>Number of medications in generated database: " + med_counter + "</p>"); parse_errors.append("<p>Number of errors in database: " + errors + "</p>"); parse_errors.append("<p>Number of missing registration number: " + missing_regnr_str + "</p>"); parse_errors.append("<p>Number of missing package info: " + missing_pack_info + "</p>"); parse_errors.append("<p>Number of missing atc codes: " + missing_atc_code + "</p>"); parse_errors.append("<br/>"); // Write and close report file parse_errors.writeHtmlToFile(); parse_errors.getBWriter().close(); // Write owner error report to file ParseReport owner_errors = new ParseReport(Constants.FILE_OWNER_REPORT, CmlOptions.DB_LANGUAGE, "html"); String report_style_str = FileOps.readCSSfromFile(Constants.FILE_REPORT_CSS_BASE + ".css"); owner_errors.addStyleSheet(report_style_str); if (CmlOptions.DB_LANGUAGE.equals("de")) owner_errors.addHtmlHeader("Schweizer Arzneimittel-Kompendium", Constants.FI_DB_VERSION); else if (CmlOptions.DB_LANGUAGE.equals("fr")) owner_errors.addHtmlHeader("Compendium des Mdicaments Suisse", Constants.FI_DB_VERSION); owner_errors.append(owner_errors.treemapToHtmlTable(tm_owner_error)); owner_errors.writeHtmlToFile(); owner_errors.getBWriter().close(); // Dump to console... 
/* for (Map.Entry<String, ArrayList<String>> entry : tm_owner_error.entrySet()) { String author = entry.getKey(); ArrayList<String> list = entry.getValue(); for (String error : list) System.out.println(author + " -> " + error); } */ } if (CmlOptions.INDICATIONS_REPORT == true) { // Dump everything to file bw_indications.write("Total number of words: " + tm_indications.size() + "\n\n"); for (Map.Entry<String, String> entry : tm_indications.entrySet()) { String key = entry.getKey(); String value = entry.getValue(); bw_indications.write(key + " [" + value + "]\n"); } bw_indications.close(); } if (CmlOptions.DB_LANGUAGE.equals("de")) { // Dump set to file, currently we do this only for German File atccodes_file = new File("./output/atc_codes_used_set.txt"); if (!atccodes_file.exists()) { atccodes_file.getParentFile().mkdirs(); atccodes_file.createNewFile(); } FileWriter fwriter = new FileWriter(atccodes_file.getAbsoluteFile()); BufferedWriter bwriter = new BufferedWriter(fwriter); Iterator<String> set_iterator = atccode_set.iterator(); while (set_iterator.hasNext()) { bwriter.write(set_iterator.next() + "\n"); } bwriter.close(); } System.out.println(""); } catch (IOException e) { e.printStackTrace(); } }