ExternalDataManager.java
/*
* The coLAB project
* Copyright (C) 2022-2023 AlbaSim, MEI, HEIG-VD, HES-SO
*
* Licensed under the MIT License
*/
package ch.colabproject.colab.api.controller.document;
import ch.colabproject.colab.api.controller.RequestManager;
import ch.colabproject.colab.api.rest.document.bean.UrlMetadata;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.URI;
import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;
import java.time.OffsetDateTime;
import java.util.HashMap;
import java.util.Iterator;
import java.util.regex.Pattern;
import javax.cache.Cache;
import javax.ejb.LocalBean;
import javax.ejb.Stateless;
import javax.inject.Inject;
import org.apache.commons.lang3.StringUtils;
import org.apache.hc.client5.http.classic.methods.HttpGet;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.Header;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.net.URIBuilder;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * To deal with external data: fetches metadata about a URL (reachability, content type and
 * {@code <meta>} tags found in the HTML head) and keeps the result in a cache.
 *
 * @author maxence
 */
@Stateless
@LocalBean
public class ExternalDataManager {

    /** Number of hours an entry may stay in cache before being dropped or refreshed */
    private static final int CACHE_TTL_HOUR = 24;

    /** Logger */
    private static final Logger logger = LoggerFactory.getLogger(ExternalDataManager.class);

    /** Open graph title property */
    private static final String OG_TITLE = "og:title";

    /** Open graph url property */
    private static final String OG_URL = "og:url";

    /** Open graph image property */
    private static final String OG_IMAGE = "og:image";

    /**
     * Matches urls which start with a scheme followed by "://". Compiled once rather than on
     * each call.
     */
    private static final Pattern HAS_PROTOCOL = Pattern.compile("[a-zA-Z][a-zA-Z0-9+.-]*://.*");

    /**
     * cache metadata to avoid spamming external services.
     */
    @Inject
    private Cache<String, UrlMetadata> metadataCache;

    /** get the baseUrl of the application */
    @Inject
    private RequestManager requestManager;

    /**
     * Read response entity as a UTF-8 string.
     *
     * @param entity http entity to read, may be null
     *
     * @return the entity content decoded as UTF-8, or an empty string if there is no entity
     *
     * @throws IOException if something went wrong
     */
    private static String getEntityAsString(HttpEntity entity) throws IOException {
        if (entity == null) {
            return "";
        }
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        entity.writeTo(baos);
        return baos.toString(StandardCharsets.UTF_8);
    }

    /**
     * Is the given data outdated?
     *
     * @param data metadata to check
     *
     * @return true if data is null, has no date, or is older than {@link #CACHE_TTL_HOUR} hours
     */
    private boolean isOutdated(UrlMetadata data) {
        if (data == null) {
            return true;
        }
        OffsetDateTime date = data.getDate();
        if (date == null) {
            return true;
        }
        OffsetDateTime endOfLife = date.plusHours(CACHE_TTL_HOUR);
        return !endOfLife.isAfter(OffsetDateTime.now());
    }

    /**
     * Get cached url metadata if a fresh entry exists, or build fresh metadata otherwise.
     *
     * @param url url to fetch metadata for
     *
     * @return url metadata
     */
    public UrlMetadata getUrlMetadata(String url) {
        try {
            UrlMetadata cached = metadataCache.get(url);
            if (cached != null && !isOutdated(cached)) {
                logger.trace("Get {} from cache", url);
                return cached;
            }
        } catch (Throwable t) {
            // NOTE(review): Throwable is kept to preserve the existing best-effort behaviour;
            // consider narrowing it once the failures the cache can raise are known.
            // The throwable is passed as the trailing argument so the stack trace is logged.
            logger.trace("Failed to fetch {} from cache", url, t);
            metadataCache.remove(url);
        }
        return this.refreshAndGetUrlMetadata(url);
    }

    /**
     * Make sure url starts with a protocol
     *
     * @param rawUrl          url to sanitize
     * @param defaultProtocol protocol to prepend if the url has none. If null or empty, http is
     *                        used
     *
     * @return url with protocol
     */
    private String sanitizeUrl(String rawUrl, String defaultProtocol) {
        if (HAS_PROTOCOL.matcher(rawUrl).matches()) {
            return rawUrl;
        }
        // There is no protocol, add default one
        String protocol = StringUtils.isEmpty(defaultProtocol) ? "http" : defaultProtocol;
        return protocol + "://" + rawUrl;
    }

    /**
     * Update cache with fresh metadata.
     * <p>
     * Urls which point to this application itself are not fetched; any other url is requested
     * with an HTTP GET.
     * <p>
     * NOTE(review): the url is fetched from the server as given; no restriction on the target
     * host is visible in this class. Confirm callers only pass trusted urls, or add filtering.
     *
     * @param url url to fetch metadata for
     *
     * @return url metadata
     */
    public UrlMetadata refreshAndGetUrlMetadata(String url) {
        UrlMetadata urlMetadata = new UrlMetadata();
        urlMetadata.setBroken(true);
        HashMap<String, String> metadata = new HashMap<>();
        urlMetadata.setMetadata(metadata);

        // hack: intercept loopback link
        String baseUrl = requestManager.getBaseUrl();
        String decoded = decodeOrRaw(url);

        if (isLoopback(decoded, baseUrl)) {
            logger.trace("Loopback url intercepted");
            urlMetadata.setBroken(false);
            metadata.put(OG_IMAGE, baseUrl + "/favicon_128.png");
            metadata.put(OG_URL, decoded);
        } else {
            logger.trace("Raw URL {}", url);
            fetchRemoteMetadata(url, urlMetadata, metadata);
        }

        urlMetadata.setDate(OffsetDateTime.now());
        // cache metadata
        metadataCache.put(url, urlMetadata);
        return urlMetadata;
    }

    /**
     * Percent-decode the url, falling back to the raw value if it contains a malformed escape.
     *
     * @param url url to decode
     *
     * @return decoded url, or the url itself if it can not be decoded
     */
    private static String decodeOrRaw(String url) {
        try {
            return URLDecoder.decode(url, StandardCharsets.UTF_8);
        } catch (IllegalArgumentException e) {
            // e.g. a lone '%': such a url must not make the whole request fail
            logger.trace("Failed to decode {}", url, e);
            return url;
        }
    }

    /**
     * Does the url point to this application?
     *
     * @param decodedUrl url to check
     * @param baseUrl    base url of the application
     *
     * @return true if decodedUrl equals baseUrl or extends it with a path, query or fragment
     */
    private static boolean isLoopback(String decodedUrl, String baseUrl) {
        if (StringUtils.isEmpty(baseUrl) || !decodedUrl.startsWith(baseUrl)) {
            return false;
        }
        if (decodedUrl.length() == baseUrl.length() || baseUrl.endsWith("/")) {
            return true;
        }
        // a bare prefix match would also accept "<baseUrl>.evil.org" or "<baseUrl>:8080"
        char next = decodedUrl.charAt(baseUrl.length());
        return next == '/' || next == '?' || next == '#';
    }

    /**
     * GET the url and fill the metadata from the response. Any failure marks the url as broken.
     *
     * @param url         url to fetch
     * @param urlMetadata bean to update (broken flag and content type)
     * @param metadata    map to fill with default values and with meta tags found in the page
     */
    private void fetchRemoteMetadata(String url, UrlMetadata urlMetadata,
        HashMap<String, String> metadata) {
        try (var client = HttpClients.createDefault()) {
            String sanitizedUrl = sanitizeUrl(url, null);
            URIBuilder uriBuilder = new URIBuilder(sanitizedUrl, StandardCharsets.UTF_8);
            URI uri = uriBuilder.normalizeSyntax().build();

            // defaults, may be overridden by meta tags found in the page
            metadata.put(OG_URL, url);
            metadata.put(OG_TITLE, defaultTitle(uri));

            var get = new HttpGet(uri);
            try (var response = client.execute(get)) {
                if (response.getCode() < 400) {
                    // success
                    urlMetadata.setBroken(false);

                    String contentType = extractContentType(
                        response.getFirstHeader("content-type"));
                    if (contentType != null) {
                        urlMetadata.setContentType(contentType);
                        if ("text/html".equalsIgnoreCase(contentType)) {
                            // try to fetch metadata in head meta tags
                            readHtmlMeta(response.getEntity(), url, metadata);
                        }
                    }
                }
            }
        } catch (Exception e) {
            logger.debug("Major Failure", e);
            urlMetadata.setBroken(true);
        }
    }

    /**
     * Build the default title of a url.
     *
     * @param uri the uri
     *
     * @return the last path segment if there is a non-blank one, the host otherwise
     */
    private static String defaultTitle(URI uri) {
        String path = uri.getPath();
        if (path != null) {
            String[] segs = path.split("/");
            if (segs.length > 0) {
                String filename = segs[segs.length - 1];
                // an empty path splits to [""], which must not be used as a title
                if (!StringUtils.isBlank(filename)) {
                    return filename;
                }
            }
        }
        return uri.getHost();
    }

    /**
     * Extract the media type from a content-type header, dropping parameters such as charset.
     *
     * @param header the content-type header, may be null
     *
     * @return the trimmed media type, or null if the header is missing or has no media type
     */
    private static String extractContentType(Header header) {
        if (header == null) {
            return null;
        }
        String value = header.getValue();
        if (value == null) {
            return null;
        }
        int separator = value.indexOf(';');
        if (separator >= 0) {
            value = value.substring(0, separator);
        }
        value = value.trim();
        return value.isEmpty() ? null : value;
    }

    /**
     * Parse the entity as HTML and copy namespaced meta tags of the head into the metadata.
     * <p>
     * A tag is kept if its "property" or, failing that, its "name" attribute contains a colon
     * (e.g. og:title); the matching attribute is used as key and "content" as value.
     *
     * @param entity   response entity which contains the HTML
     * @param url      url used as base uri by the parser
     * @param metadata map to fill
     *
     * @throws IOException if the entity can not be read
     */
    private static void readHtmlMeta(HttpEntity entity, String url,
        HashMap<String, String> metadata) throws IOException {
        String html = getEntityAsString(entity);
        Document htmlDocument = Jsoup.parse(html, url);
        Elements metas = htmlDocument.head().select("meta");
        for (var meta : metas) {
            String key = meta.attr("property");
            if (key.indexOf(':') < 0) {
                // no namespaced property, fall back to the name attribute
                key = meta.attr("name");
            }
            if (key.indexOf(':') >= 0) {
                metadata.put(key, meta.attr("content"));
            }
        }
    }

    /**
     * Drop outdated entries from cache
     */
    public void clearOutdated() {
        Iterator<Cache.Entry<String, UrlMetadata>> iterator = metadataCache.iterator();
        while (iterator.hasNext()) {
            Cache.Entry<String, UrlMetadata> entry = iterator.next();
            if (isOutdated(entry.getValue())) {
                iterator.remove();
            }
        }
    }
}