HtmlUtils.java
package fr.sii.ogham.core.util;
import static java.util.Arrays.asList;

import java.net.URI;
import java.net.URISyntaxException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.StringJoiner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Utility class for handling HTML content. It helps for repetitive tasks for
* manipulating HTML.
*
* @author Aurélien Baudet
*
*/
public final class HtmlUtils {
private static final Logger LOG = LoggerFactory.getLogger(HtmlUtils.class);
private static final Pattern HTML_PATTERN = Pattern.compile("<html", Pattern.CASE_INSENSITIVE);
private static final String CSS_LINKS_SELECTOR = "link[rel*=\"stylesheet\"], link[type=\"text/css\"], link[href$=\".css\"]";
private static final String HREF_ATTR = "href";
private static final String IMG_SELECTOR = "img";
private static final String SRC_ATTR = "src";
private static final Pattern URL_PATTERN = Pattern.compile("^https?://.+$", Pattern.CASE_INSENSITIVE);
private static final Pattern URI_INVALID_CHARS = Pattern.compile("\\\\'");
private static final String URI_ESCAPE = "''";
private static final Pattern QUOTE_ENTITY = Pattern.compile(""");
private static final String UNQUOTED_FORM = "(?<startunquoted>\\s*url\\s*[(]\\s*)(?<urlunquoted>(?:\\\\[()\\s]|[^()\\s])+)(?<endunquoted>\\s*[)]\\s*(?:[\\s;,'\"]|$))";
private static final String QUOTED_FORM = "(?<start#QUOTENAME#>\\s*url\\s*[(]\\s*)(?<quote#QUOTENAME#>#QUOTE#)(?<url#QUOTENAME#>(?:\\\\#QUOTE#|(?!#QUOTE#).)+)#QUOTE#(?<end#QUOTENAME#>\\s*[)]\\s*(?:[\\s;,'\"]|$))";
/**
* Regular expression that matches CSS properties for image inclusions such
* as:
* <ul>
* <li>{@code background: <value>;}</li>
* <li>{@code background-image: <value>};</li>
* <li>{@code list-style: <value>};</li>
* <li>{@code list-style-image: <value>};</li>
* <li>{@code cursor: <value>};</li>
* </ul>
*
* <p>
* The pattern provides the following named capturing groups:
* <ul>
* <li>{@code "property"}: matches the property part (property name, spaces
* and {@literal :})</li>
* <li>{@code "propertyname"}: matches the property name (such as
* {@code background})</li>
* <li>{@code "value"}: matches the property value (without final
* {@literal ;})</li>
* </ul>
*/
public static final Pattern CSS_IMAGE_PROPERTIES_PATTERN = Pattern.compile("(?<property>(?<propertyname>((background|list-style)(-image)?)|cursor)\\s*:)(?<value>[^;}>]+)",
Pattern.MULTILINE | Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
/**
* Indicates if the provided content is HTML or not. It is considered HTML
* only if it is a whole document. Any partial HTML content won't be
* considered as HTML.
*
* @param content
* the content to test
* @return true if it is HTML, false otherwise
*/
public static boolean isHtml(String content) {
return HTML_PATTERN.matcher(content).find();
}
/**
* Finds all CSS file inclusions (looks for <code>link</code> tags for
* stylesheet files). Returns only the path or URL to the CSS file. If the
* several CSS inclusions have the same path, the path is present in the
* list only one time.
*
* @param htmlContent
* the html content that may contain external CSS files
* @return the list of found CSS inclusions (paths only) or empty if nothing
* found
*/
public static List<String> getDistinctCssUrls(String htmlContent) {
Document doc = Jsoup.parse(htmlContent);
Elements els = doc.select(CSS_LINKS_SELECTOR);
List<String> cssFiles = new ArrayList<>(els.size());
for (Element e : els) {
String path = e.attr(HREF_ATTR);
if (!cssFiles.contains(path)) {
cssFiles.add(path);
}
}
return cssFiles;
}
/**
* Finds all image inclusions (looks for <code>img</code> tags). Returns
* only the path or URL to the image. If the several images have the same
* path, the path is present in the list only one time.
*
* @param htmlContent
* the html content that may contain image files
* @return the list of found images (paths only) or empty if nothing found
*/
public static List<String> getDistinctImageUrls(String htmlContent) {
Document doc = Jsoup.parse(htmlContent);
Elements els = doc.select(IMG_SELECTOR);
List<String> images = new ArrayList<>(els.size());
for (Element e : els) {
String path = e.attr(SRC_ATTR);
if (!images.contains(path)) {
images.add(path);
}
}
return images;
}
/**
* Finds all image inclusions from CSS properties. Returns only the path or
* URL to the image. If the several images have the same path, the path is
* present in the list only one time.
*
* <p>
* It looks for:
* <ul>
* <li><code>background</code></li>
* <li><code>background-image</code></li>
* <li><code>list-style</code></li>
* <li><code>list-style-image</code></li>
* <li><code>cursor</code></li>
* </ul>
*
* @param htmlContent
* the html content that may contain image files
* @return the list of found images (paths only) or empty if nothing found
*/
public static List<String> getDistinctCssImageUrls(String htmlContent) {
List<String> urls = new ArrayList<>();
Matcher m = CSS_IMAGE_PROPERTIES_PATTERN.matcher(QUOTE_ENTITY.matcher(htmlContent).replaceAll("'"));
while (m.find()) {
for (CssUrlFunction url : getCssUrlFunctions(m.group("value"))) {
if (!urls.contains(url.getUrl())) {
urls.add(url.getUrl());
}
}
}
return urls;
}
/**
* Parse the CSS property value that may contain one or several
* {@code url()} CSS function(s).
*
* Each element of the returned list provides the following information:
* <ul>
* <li>{@code "source"}: the whole match of the {@code url()} function</li>
* <li>{@code "start"}: matches the {@code url(} part (without quote, spaces
* are preserved)</li>
* <li>{@code "end"}: matches the {@code )} part (without quote, spaces are
* preserved)</li>
* <li>{@code "url"}: the url (without surrounding quotes)</li>
* <li>{@code "enclosingQuoteChar"}: either {@literal "} character,
* {@literal '} character or empty string</li>
* </ul>
*
* <strong>WARNING:</strong> This function doesn't attempt to validate the
* URL at all. It just extracts the different parts for later parsing. If
* either the URL or CSS property value or the {@code url()} function is
* invalid, it may still return a value because it depends on the parsing
* context. It may then return an invalid form. For example
* {@code url('images/h'1.gif')} is not valid due to unscaped single quote,
* however this method will return a result with {@code images/h'1.gif} as
* URL.
*
* @param cssPropertyValue
* the value of the CSS property
* @param additionalEnclosingQuotes
* allow additional forms such as
* {@code url("http://some-url")} that may be used in
* style attribute
* @return the list of meta information about the matched urls
*/
public static List<CssUrlFunction> getCssUrlFunctions(String cssPropertyValue, String... additionalEnclosingQuotes) {
List<String> possibleQuotes = new ArrayList<>(asList("'", "\""));
possibleQuotes.addAll(asList(additionalEnclosingQuotes));
Pattern cssUrlFuncPattern = generateUrlFuncPattern(possibleQuotes);
List<CssUrlFunction> urls = new ArrayList<>();
Matcher urlMatcher = cssUrlFuncPattern.matcher(cssPropertyValue);
while (urlMatcher.find()) {
CssUrlFunction url = null;
for (int i = 0; i < possibleQuotes.size(); i++) {
if (urlMatcher.group("quotedform" + i) != null) {
url = new CssUrlFunction(urlMatcher.group("quotedform" + i), urlMatcher.group("start" + i), urlMatcher.group("url" + i), urlMatcher.group("end" + i), possibleQuotes.get(i));
break;
}
}
if (urlMatcher.group("unquotedform") != null) {
url = new CssUrlFunction(urlMatcher.group("unquotedform"), urlMatcher.group("startunquoted"), urlMatcher.group("urlunquoted"), urlMatcher.group("endunquoted"), "");
}
if (url != null) {
urls.add(url);
}
}
return urls;
}
/**
* Get the title of the HTML. If no <code>title</code> tag exists, then the
* title is null.
*
* @param htmlContent
* the HTML content that may contain a title
* @return the title of the HTML or null if none
*/
public static String getTitle(String htmlContent) {
Document doc = Jsoup.parse(htmlContent);
Elements titleNode = doc.select("head > title");
return titleNode.isEmpty() ? null : doc.title();
}
/**
* The list of provided URLs are either relative or absolute. This method
* returns only the list of relative URLs.
*
* <p>
* The URL is considered absolute if it starts with {@code "http://"} or
* {@code https://}.
*
*
* @param urls
* the urls (relative or absolute)
* @return the relative urls only
*/
public static List<String> skipExternalUrls(List<String> urls) {
for (Iterator<String> it = urls.iterator(); it.hasNext();) {
String url = it.next();
if (URL_PATTERN.matcher(url).matches()) {
it.remove();
}
}
return urls;
}
/**
* Generate a relative URL/path:
* <ul>
* <li>If {@code other} parameter is absolute, then return
* {@code other}.</li>
* <li>If {@code other} parameter is relative, then it merges {@code other}
* into {@code base}. For example:
* <ul>
* <li>base="css/foo.css", other="bar.png" {@literal =>} returns
* "css/bar.png"</li>
* <li>base="css/foo.css", other="../images/bar.png" {@literal =>} returns
* "images/bar.png"</li>
* <li>base="http://some-url/css/foo.css", other="bar.png" {@literal =>}
* returns "http://some-url/css/bar.png"</li>
* <li>base="http://some-url/css/foo.css", other="../images/bar.png"
* {@literal =>} returns "http://some-url/images/bar.png"</li>
* </ul>
* </li>
* </ul>
*
* <p>
* This method uses {@link #isRelativeUrl(String)} to determine if
* {@code other} is relative or absolute.
*
* @param base
* the base path/URL
* @param other
* the path/URL to relativize
* @return the merge path/URL
*/
public static String relativize(String base, String other) {
if (!isRelativeUrl(other)) {
return other;
}
Path basePath = Paths.get(base);
return unescapeJavaUri(ResourceUtils.toResourcePath(basePath.resolveSibling(escapeForJavaUri(other)).normalize()));
}
/**
* Indicates if the URL is relative or not.
*
* <p>
* Relative URLs may be:
* <ul>
* <li>{@code "relative/path"}</li>
* <li>{@code "./relative/path"}</li>
* <li>{@code "../relative/path"}</li>
* </ul>
*
* <p>
* On the contrary, any URL that matches one of the following condition is
* absolute:
* <ul>
* <li>starts with a scheme or protocol (like {@code "http://"} or
* {@code "classpath:"}</li>
* <li>starts with a {@code "/"}</li>
* </ul>
*
* @param url
* the URL that may be relative or absolute
* @return true if relative
*/
public static boolean isRelativeUrl(String url) {
try {
if (url.startsWith("/")) {
return false;
}
URI u = new URI(escapeForJavaUri(url));
return !u.isAbsolute();
} catch (URISyntaxException e) {
LOG.warn("Can't determine if '{}' url is relative or absolute => consider absolute", url);
LOG.trace("", e);
return false;
}
}
private static String escapeForJavaUri(String url) {
return URI_INVALID_CHARS.matcher(url).replaceAll(URI_ESCAPE);
}
@SuppressWarnings({ "java:S5361", "squid:S5361" })
private static String unescapeJavaUri(String url) {
return url.replaceAll(URI_ESCAPE, URI_INVALID_CHARS.pattern());
}
private static Pattern generateUrlFuncPattern(List<String> possibleQuotes) {
StringJoiner joiner = new StringJoiner("|");
int i = 0;
for (String possibleQuote : possibleQuotes) {
joiner.add("(?<quotedform" + i + ">" + QUOTED_FORM.replace("#QUOTE#", Pattern.quote(possibleQuote)).replace("#QUOTENAME#", i + "") + ")");
i++;
}
joiner.add("(?<unquotedform>" + UNQUOTED_FORM + ")");
return Pattern.compile(joiner.toString(), Pattern.MULTILINE);
}
private HtmlUtils() {
super();
}
}