1
2
3
4
5
6
7 package ch.colabproject.colab.api.controller.document;
8
9 import ch.colabproject.colab.api.controller.RequestManager;
10 import ch.colabproject.colab.api.rest.document.bean.UrlMetadata;
11 import java.io.ByteArrayOutputStream;
12 import java.io.IOException;
13 import java.net.URI;
14 import java.net.URLDecoder;
15 import java.nio.charset.StandardCharsets;
16 import java.time.OffsetDateTime;
17 import java.util.HashMap;
18 import java.util.Iterator;
19 import javax.cache.Cache;
20 import javax.ejb.LocalBean;
21 import javax.ejb.Stateless;
22 import javax.inject.Inject;
23 import org.apache.commons.lang3.StringUtils;
24 import org.apache.hc.client5.http.classic.methods.HttpGet;
25 import org.apache.hc.client5.http.impl.classic.HttpClients;
26 import org.apache.hc.core5.http.Header;
27 import org.apache.hc.core5.http.HttpEntity;
28 import org.apache.hc.core5.net.URIBuilder;
29 import org.jsoup.Jsoup;
30 import org.jsoup.nodes.Document;
31 import org.jsoup.select.Elements;
32 import org.slf4j.Logger;
33 import org.slf4j.LoggerFactory;
34
35
36
37
38
39
40 @Stateless
41 @LocalBean
42 public class ExternalDataManager {
43
44
45 private static final int CACHE_TTL_HOUR = 24;
46
47
48 private static final Logger logger = LoggerFactory.getLogger(UrlMetadata.class);
49
50
51 private static final String OG_TITLE = "og:title";
52
53
54 private static final String OG_URL = "og:url";
55
56
57 private static final String OG_IMAGE = "og:image";
58
59
60
61
62 @Inject
63 private Cache<String, UrlMetadata> metadataCache;
64
65
66 @Inject
67 private RequestManager requestManager;
68
69
70
71
72
73
74
75
76
77
78 private static String getEntityAsString(HttpEntity entity) throws IOException {
79 if (entity != null) {
80 ByteArrayOutputStream baos = new ByteArrayOutputStream();
81 entity.writeTo(baos);
82 return baos.toString("UTF-8");
83 } else {
84 return "";
85 }
86 }
87
88
89
90
91
92
93
94
95 private boolean isOutdated(UrlMetadata data) {
96 OffsetDateTime date = data.getDate();
97 if (date != null) {
98 OffsetDateTime endOfLife = date.plusHours(CACHE_TTL_HOUR);
99 if (endOfLife.isAfter(OffsetDateTime.now())) {
100 return false;
101 }
102 }
103
104 return true;
105 }
106
107
108
109
110
111
112
113
114 public UrlMetadata getUrlMetadata(String url) {
115 try {
116 UrlMetadata cached = metadataCache.get(url);
117 if (cached != null && !isOutdated(cached)) {
118 logger.trace("Get {} from cache", url);
119 return cached;
120 }
121 } catch (Throwable t) {
122 logger.trace("Failed to fetch {} from cache {}", url, t);
123 metadataCache.remove(url);
124 }
125 return this.refreshAndGetUrlMetadata(url);
126 }
127
128
129
130
131
132
133
134
135
136 private String sanitizeUrl(String rawUrl, String defaultProtocol) {
137 if (!rawUrl.matches("[a-z-A-Z0-9]*://.*")) {
138
139 if (StringUtils.isEmpty(defaultProtocol)) {
140 return "http://" + rawUrl;
141 } else {
142 return defaultProtocol + "://" + rawUrl;
143 }
144 }
145 return rawUrl;
146 }
147
148
149
150
151
152
153
154
155 public UrlMetadata refreshAndGetUrlMetadata(String url) {
156
157 UrlMetadata urlMetadata = new UrlMetadata();
158 urlMetadata.setBroken(true);
159 HashMap<String, String> metadata = new HashMap<>();
160 urlMetadata.setMetadata(metadata);
161
162 String decoded = URLDecoder.decode(url, StandardCharsets.UTF_8);
163
164
165 String baseUrl = requestManager.getBaseUrl();
166 if (decoded.startsWith(baseUrl)) {
167 logger.trace("Loopback url intercepted");
168 urlMetadata.setBroken(false);
169 metadata.put(OG_IMAGE, baseUrl + "/favicon_128.png");
170 metadata.put(OG_URL, decoded);
171 } else {
172
173 logger.trace("Raw URL {}", url);
174 try (var client = HttpClients.createDefault()) {
175 String sanitizedUrl = sanitizeUrl(url, null);
176
177 URIBuilder uriBuilder = new URIBuilder(sanitizedUrl, StandardCharsets.UTF_8);
178
179 URI uri = uriBuilder.normalizeSyntax().build();
180 metadata.put(OG_URL, url);
181
182 String[] segs = uri.getPath().split("/");
183 if (segs != null && segs.length > 0) {
184
185 String filename = segs[segs.length - 1];
186 metadata.put(OG_TITLE, filename);
187 } else {
188
189 metadata.put(OG_TITLE, uri.getHost());
190 }
191
192 var get = new HttpGet(uri);
193 try (var response = client.execute(get)) {
194
195 HttpEntity entity = response.getEntity();
196 int statusCode = response.getCode();
197
198 if (statusCode < 400) {
199
200 urlMetadata.setBroken(false);
201
202 Header firstHeader = response.getFirstHeader("content-type");
203 String contentType = firstHeader.getValue();
204 int separator = contentType.indexOf(';');
205
206 if (separator > 0) {
207 contentType = contentType.substring(0, separator);
208 }
209
210 if (contentType != null) {
211 urlMetadata.setContentType(contentType);
212 if (contentType.equals("text/html")) {
213
214 String html = getEntityAsString(entity);
215 Document htmlDocument = Jsoup.parse(html, url);
216 Elements metas = htmlDocument.head().select("meta");
217 metas.forEach(meta -> {
218 String prop = meta.attr("property");
219 String name = meta.attr("name");
220 if (prop != null && prop.indexOf(':') >= 0
221 || name != null && name.indexOf(':') >= 0) {
222 metadata.put(prop, meta.attr("content"));
223 }
224 });
225 }
226 }
227 }
228
229 }
230 } catch (Exception e) {
231 logger.debug("Major Failure", e);
232 urlMetadata.setBroken(true);
233 }
234 }
235 urlMetadata.setDate(OffsetDateTime.now());
236
237 metadataCache.put(url, urlMetadata);
238 return urlMetadata;
239 }
240
241
242
243
244 public void clearOutdated() {
245 Iterator<Cache.Entry<String, UrlMetadata>> iterator = metadataCache.iterator();
246 while (iterator.hasNext()) {
247 Cache.Entry<String, UrlMetadata> entry = iterator.next();
248 UrlMetadata data = entry.getValue();
249 if (isOutdated(data)) {
250 iterator.remove();
251 }
252 }
253 }
254
255 }