View Javadoc
1   /*
2    * The coLAB project
3    * Copyright (C) 2022-2023 AlbaSim, MEI, HEIG-VD, HES-SO
4    *
5    * Licensed under the MIT License
6    */
7   package ch.colabproject.colab.api.controller.document;
8   
9   import ch.colabproject.colab.api.controller.RequestManager;
10  import ch.colabproject.colab.api.rest.document.bean.UrlMetadata;
11  import java.io.ByteArrayOutputStream;
12  import java.io.IOException;
13  import java.net.URI;
14  import java.net.URLDecoder;
15  import java.nio.charset.StandardCharsets;
16  import java.time.OffsetDateTime;
17  import java.util.HashMap;
18  import java.util.Iterator;
19  import javax.cache.Cache;
20  import javax.ejb.LocalBean;
21  import javax.ejb.Stateless;
22  import javax.inject.Inject;
23  import org.apache.commons.lang3.StringUtils;
24  import org.apache.hc.client5.http.classic.methods.HttpGet;
25  import org.apache.hc.client5.http.impl.classic.HttpClients;
26  import org.apache.hc.core5.http.Header;
27  import org.apache.hc.core5.http.HttpEntity;
28  import org.apache.hc.core5.net.URIBuilder;
29  import org.jsoup.Jsoup;
30  import org.jsoup.nodes.Document;
31  import org.jsoup.select.Elements;
32  import org.slf4j.Logger;
33  import org.slf4j.LoggerFactory;
34  
35  /**
36   * To deal with external data
37   *
38   * @author maxence
39   */
40  @Stateless
41  @LocalBean
42  public class ExternalDataManager {
43  
44      /** duration an entry may stay in cache before being drop or refreshed */
45      private static final int CACHE_TTL_HOUR = 24;
46  
47      /** Logger */
48      private static final Logger logger = LoggerFactory.getLogger(UrlMetadata.class);
49  
50      /** Open graph title property */
51      private static final String OG_TITLE = "og:title";
52  
53      /** Open graph title */
54      private static final String OG_URL = "og:url";
55  
56      /** Open graph image */
57      private static final String OG_IMAGE = "og:image";
58  
59      /**
60       * cache metadata to avoid spamming external services.
61       */
62      @Inject
63      private Cache<String, UrlMetadata> metadataCache;
64  
65      /** get the baseUrl of the application */
66      @Inject
67      private RequestManager requestManager;
68  
69      /**
70       * Read response entity as stream
71       *
72       * @param entity http entity to read
73       *
74       * @return the string
75       *
76       * @throws IOException if something went wrong
77       */
78      private static String getEntityAsString(HttpEntity entity) throws IOException {
79          if (entity != null) {
80              ByteArrayOutputStream baos = new ByteArrayOutputStream();
81              entity.writeTo(baos);
82              return baos.toString("UTF-8");
83          } else {
84              return "";
85          }
86      }
87  
88      /**
89       * Is the given data outdated?
90       *
91       * @param data metadata to check
92       *
93       * @return true if data is outdated
94       */
95      private boolean isOutdated(UrlMetadata data) {
96          OffsetDateTime date = data.getDate();
97          if (date != null) {
98              OffsetDateTime endOfLife = date.plusHours(CACHE_TTL_HOUR);
99              if (endOfLife.isAfter(OffsetDateTime.now())) {
100                 return false;
101             }
102         }
103 
104         return true;
105     }
106 
107     /**
108      * Get cached Url metadata. if exists of build fresh
109      *
110      * @param url url to fetch metadata for
111      *
112      * @return url metadata
113      */
114     public UrlMetadata getUrlMetadata(String url) {
115         try {
116             UrlMetadata cached = metadataCache.get(url);
117             if (cached != null && !isOutdated(cached)) {
118                 logger.trace("Get {} from cache", url);
119                 return cached;
120             }
121         } catch (Throwable t) {
122             logger.trace("Failed to fetch {} from cache {}", url, t);
123             metadataCache.remove(url);
124         }
125         return this.refreshAndGetUrlMetadata(url);
126     }
127 
128     /**
129      * Make sure url starts with a protocol
130      *
131      * @param url             to sanitize
132      * @param defaultProtocol default protocol to use. http is the default defaultProtocol
133      *
134      * @return url with protocol
135      */
136     private String sanitizeUrl(String rawUrl, String defaultProtocol) {
137         if (!rawUrl.matches("[a-z-A-Z0-9]*://.*")) {
138             // There is no protocol, add default one
139             if (StringUtils.isEmpty(defaultProtocol)) {
140                 return "http://" + rawUrl;
141             } else {
142                 return defaultProtocol + "://" + rawUrl;
143             }
144         }
145         return rawUrl;
146     }
147 
148     /**
149      * Update cache with fresh metadata
150      *
151      * @param url url to fetch metadata for
152      *
153      * @return url metadata
154      */
155     public UrlMetadata refreshAndGetUrlMetadata(String url) {
156 
157         UrlMetadata urlMetadata = new UrlMetadata();
158         urlMetadata.setBroken(true);
159         HashMap<String, String> metadata = new HashMap<>();
160         urlMetadata.setMetadata(metadata);
161 
162         String decoded = URLDecoder.decode(url, StandardCharsets.UTF_8);
163 
164         // hack: intercept loobpack link
165         String baseUrl = requestManager.getBaseUrl();
166         if (decoded.startsWith(baseUrl)) {
167             logger.trace("Loopback url intercepted");
168             urlMetadata.setBroken(false);
169             metadata.put(OG_IMAGE, baseUrl + "/favicon_128.png");
170             metadata.put(OG_URL, decoded);
171         } else {
172 
173             logger.trace("Raw URL {}", url);
174             try (var client = HttpClients.createDefault()) {
175                 String sanitizedUrl = sanitizeUrl(url, null);
176 
177                 URIBuilder uriBuilder = new URIBuilder(sanitizedUrl, StandardCharsets.UTF_8);
178 
179                 URI uri = uriBuilder.normalizeSyntax().build();
180                 metadata.put(OG_URL, url);
181 
182                 String[] segs = uri.getPath().split("/");
183                 if (segs != null && segs.length > 0) {
184                     // default og:name to last path segment
185                     String filename = segs[segs.length - 1];
186                     metadata.put(OG_TITLE, filename);
187                 } else {
188                     // otherwise, default to hostname
189                     metadata.put(OG_TITLE, uri.getHost());
190                 }
191 
192                 var get = new HttpGet(uri);
193                 try (var response = client.execute(get)) {
194 
195                     HttpEntity entity = response.getEntity();
196                     int statusCode = response.getCode();
197 
198                     if (statusCode < 400) {
199                         // success
200                         urlMetadata.setBroken(false);
201 
202                         Header firstHeader = response.getFirstHeader("content-type");
203                         String contentType = firstHeader.getValue();
204                         int separator = contentType.indexOf(';');
205 
206                         if (separator > 0) {
207                             contentType = contentType.substring(0, separator);
208                         }
209 
210                         if (contentType != null) {
211                             urlMetadata.setContentType(contentType);
212                             if (contentType.equals("text/html")) {
213                                 // try to fetch metadata in head meta tags
214                                 String html = getEntityAsString(entity);
215                                 Document htmlDocument = Jsoup.parse(html, url);
216                                 Elements metas = htmlDocument.head().select("meta");
217                                 metas.forEach(meta -> {
218                                     String prop = meta.attr("property");
219                                     String name = meta.attr("name");
220                                     if (prop != null && prop.indexOf(':') >= 0
221                                         || name != null && name.indexOf(':') >= 0) {
222                                         metadata.put(prop, meta.attr("content"));
223                                     }
224                                 });
225                             }
226                         }
227                     }
228 
229                 }
230             } catch (Exception e) {
231                 logger.debug("Major Failure", e);
232                 urlMetadata.setBroken(true);
233             }
234         }
235         urlMetadata.setDate(OffsetDateTime.now());
236         // cache metadata
237         metadataCache.put(url, urlMetadata);
238         return urlMetadata;
239     }
240 
241     /**
242      * Drop outdated entries from cache
243      */
244     public void clearOutdated() {
245         Iterator<Cache.Entry<String, UrlMetadata>> iterator = metadataCache.iterator();
246         while (iterator.hasNext()) {
247             Cache.Entry<String, UrlMetadata> entry = iterator.next();
248             UrlMetadata data = entry.getValue();
249             if (isOutdated(data)) {
250                 iterator.remove();
251             }
252         }
253     }
254 
255 }