ExternalDataManager.java
- /*
- * The coLAB project
- * Copyright (C) 2022-2023 AlbaSim, MEI, HEIG-VD, HES-SO
- *
- * Licensed under the MIT License
- */
- package ch.colabproject.colab.api.controller.document;
- import ch.colabproject.colab.api.controller.RequestManager;
- import ch.colabproject.colab.api.rest.document.bean.UrlMetadata;
- import java.io.ByteArrayOutputStream;
- import java.io.IOException;
- import java.net.URI;
- import java.net.URLDecoder;
- import java.nio.charset.StandardCharsets;
- import java.time.OffsetDateTime;
- import java.util.HashMap;
- import java.util.Iterator;
- import javax.cache.Cache;
- import javax.ejb.LocalBean;
- import javax.ejb.Stateless;
- import javax.inject.Inject;
- import org.apache.commons.lang3.StringUtils;
- import org.apache.hc.client5.http.classic.methods.HttpGet;
- import org.apache.hc.client5.http.impl.classic.HttpClients;
- import org.apache.hc.core5.http.Header;
- import org.apache.hc.core5.http.HttpEntity;
- import org.apache.hc.core5.net.URIBuilder;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import org.jsoup.select.Elements;
- import org.slf4j.Logger;
- import org.slf4j.LoggerFactory;
- /**
- * To deal with external data
- *
- * @author maxence
- */
- @Stateless
- @LocalBean
- public class ExternalDataManager {
- /** duration an entry may stay in cache before being drop or refreshed */
- private static final int CACHE_TTL_HOUR = 24;
- /** Logger */
- private static final Logger logger = LoggerFactory.getLogger(UrlMetadata.class);
- /** Open graph title property */
- private static final String OG_TITLE = "og:title";
- /** Open graph title */
- private static final String OG_URL = "og:url";
- /** Open graph image */
- private static final String OG_IMAGE = "og:image";
- /**
- * cache metadata to avoid spamming external services.
- */
- @Inject
- private Cache<String, UrlMetadata> metadataCache;
- /** get the baseUrl of the application */
- @Inject
- private RequestManager requestManager;
- /**
- * Read response entity as stream
- *
- * @param entity http entity to read
- *
- * @return the string
- *
- * @throws IOException if something went wrong
- */
- private static String getEntityAsString(HttpEntity entity) throws IOException {
- if (entity != null) {
- ByteArrayOutputStream baos = new ByteArrayOutputStream();
- entity.writeTo(baos);
- return baos.toString("UTF-8");
- } else {
- return "";
- }
- }
- /**
- * Is the given data outdated?
- *
- * @param data metadata to check
- *
- * @return true if data is outdated
- */
- private boolean isOutdated(UrlMetadata data) {
- OffsetDateTime date = data.getDate();
- if (date != null) {
- OffsetDateTime endOfLife = date.plusHours(CACHE_TTL_HOUR);
- if (endOfLife.isAfter(OffsetDateTime.now())) {
- return false;
- }
- }
- return true;
- }
- /**
- * Get cached Url metadata. if exists of build fresh
- *
- * @param url url to fetch metadata for
- *
- * @return url metadata
- */
- public UrlMetadata getUrlMetadata(String url) {
- try {
- UrlMetadata cached = metadataCache.get(url);
- if (cached != null && !isOutdated(cached)) {
- logger.trace("Get {} from cache", url);
- return cached;
- }
- } catch (Throwable t) {
- logger.trace("Failed to fetch {} from cache {}", url, t);
- metadataCache.remove(url);
- }
- return this.refreshAndGetUrlMetadata(url);
- }
- /**
- * Make sure url starts with a protocol
- *
- * @param url to sanitize
- * @param defaultProtocol default protocol to use. http is the default defaultProtocol
- *
- * @return url with protocol
- */
- private String sanitizeUrl(String rawUrl, String defaultProtocol) {
- if (!rawUrl.matches("[a-z-A-Z0-9]*://.*")) {
- // There is no protocol, add default one
- if (StringUtils.isEmpty(defaultProtocol)) {
- return "http://" + rawUrl;
- } else {
- return defaultProtocol + "://" + rawUrl;
- }
- }
- return rawUrl;
- }
- /**
- * Update cache with fresh metadata
- *
- * @param url url to fetch metadata for
- *
- * @return url metadata
- */
- public UrlMetadata refreshAndGetUrlMetadata(String url) {
- UrlMetadata urlMetadata = new UrlMetadata();
- urlMetadata.setBroken(true);
- HashMap<String, String> metadata = new HashMap<>();
- urlMetadata.setMetadata(metadata);
- String decoded = URLDecoder.decode(url, StandardCharsets.UTF_8);
- // hack: intercept loobpack link
- String baseUrl = requestManager.getBaseUrl();
- if (decoded.startsWith(baseUrl)) {
- logger.trace("Loopback url intercepted");
- urlMetadata.setBroken(false);
- metadata.put(OG_IMAGE, baseUrl + "/favicon_128.png");
- metadata.put(OG_URL, decoded);
- } else {
- logger.trace("Raw URL {}", url);
- try (var client = HttpClients.createDefault()) {
- String sanitizedUrl = sanitizeUrl(url, null);
- URIBuilder uriBuilder = new URIBuilder(sanitizedUrl, StandardCharsets.UTF_8);
- URI uri = uriBuilder.normalizeSyntax().build();
- metadata.put(OG_URL, url);
- String[] segs = uri.getPath().split("/");
- if (segs != null && segs.length > 0) {
- // default og:name to last path segment
- String filename = segs[segs.length - 1];
- metadata.put(OG_TITLE, filename);
- } else {
- // otherwise, default to hostname
- metadata.put(OG_TITLE, uri.getHost());
- }
- var get = new HttpGet(uri);
- try (var response = client.execute(get)) {
- HttpEntity entity = response.getEntity();
- int statusCode = response.getCode();
- if (statusCode < 400) {
- // success
- urlMetadata.setBroken(false);
- Header firstHeader = response.getFirstHeader("content-type");
- String contentType = firstHeader.getValue();
- int separator = contentType.indexOf(';');
- if (separator > 0) {
- contentType = contentType.substring(0, separator);
- }
- if (contentType != null) {
- urlMetadata.setContentType(contentType);
- if (contentType.equals("text/html")) {
- // try to fetch metadata in head meta tags
- String html = getEntityAsString(entity);
- Document htmlDocument = Jsoup.parse(html, url);
- Elements metas = htmlDocument.head().select("meta");
- metas.forEach(meta -> {
- String prop = meta.attr("property");
- String name = meta.attr("name");
- if (prop != null && prop.indexOf(':') >= 0
- || name != null && name.indexOf(':') >= 0) {
- metadata.put(prop, meta.attr("content"));
- }
- });
- }
- }
- }
- }
- } catch (Exception e) {
- logger.debug("Major Failure", e);
- urlMetadata.setBroken(true);
- }
- }
- urlMetadata.setDate(OffsetDateTime.now());
- // cache metadata
- metadataCache.put(url, urlMetadata);
- return urlMetadata;
- }
- /**
- * Drop outdated entries from cache
- */
- public void clearOutdated() {
- Iterator<Cache.Entry<String, UrlMetadata>> iterator = metadataCache.iterator();
- while (iterator.hasNext()) {
- Cache.Entry<String, UrlMetadata> entry = iterator.next();
- UrlMetadata data = entry.getValue();
- if (isOutdated(data)) {
- iterator.remove();
- }
- }
- }
- }