Coverage for /home/runner/work/viur-core/viur-core/viur/src/viur/core/bones/text.py: 46%
214 statements
« prev ^ index » next coverage.py v7.6.3, created at 2024-10-16 22:16 +0000
« prev ^ index » next coverage.py v7.6.3, created at 2024-10-16 22:16 +0000
1"""
2The `text` module contains the `TextBone` and a custom HTML parser
3used to validate and sanitize client data for the `TextBone`.
4"""
5import string
6import warnings
7from base64 import urlsafe_b64decode
8from datetime import datetime
9from html import entities as htmlentitydefs
10from html.parser import HTMLParser
11import typing as t
13from viur.core import db, conf
14from viur.core.bones.base import BaseBone, ReadFromClientError, ReadFromClientErrorSeverity
16_defaultTags = {
17 "validTags": [ # List of HTML-Tags which are valid
18 'b', 'a', 'i', 'u', 'span', 'div', 'p', 'img', 'ol', 'ul', 'li', 'abbr', 'sub', 'sup',
19 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'table', 'thead', 'tbody', 'tfoot', 'tr', 'td', 'th', 'br',
20 'hr', 'strong', 'blockquote', 'em'],
21 "validAttrs": { # Mapping of valid parameters for each tag (if a tag is not listed here: no parameters allowed)
22 "a": ["href", "target", "title"],
23 "abbr": ["title"],
24 "span": ["title"],
25 "img": ["src", "alt", "title"], # "srcset" must not be in this list. It will be injected by ViUR
26 "td": ["colspan", "rowspan"],
27 "p": ["data-indent"],
28 "blockquote": ["cite"]
29 },
30 "validStyles": [
31 "color"
32 ], # List of CSS-Directives we allow
33 "validClasses": ["vitxt-*", "viur-txt-*"], # List of valid class-names that are valid
34 "singleTags": ["br", "img", "hr"] # List of tags, which don't have a corresponding end tag
35}
36"""
37A dictionary containing default configurations for handling HTML content in TextBone instances.
39- validTags (list[str]):
40 A list of valid HTML tags allowed in TextBone instances.
41- validAttrs (dict[str, list[str]]):
42 A dictionary mapping valid attributes for each tag. If a tag is not listed, no attributes are allowed for that tag.
43- validStyles (list[str]):
44 A list of allowed CSS directives for the TextBone instances.
45- validClasses (list[str]):
46 A list of valid CSS class names allowed in TextBone instances.
47- singleTags (list[str]):
48 A list of self-closing HTML tags that don't have corresponding end tags.
49"""
class CollectBlobKeys(HTMLParser):
    """
    A custom HTML parser that collects the blob keys of files referenced by ``<a>`` and ``<img>`` tags.

    Feed HTML into the parser with :meth:`feed`; the download keys found so far are available
    in :attr:`blobs`.
    """

    def __init__(self):
        super().__init__()
        self.blobs = set()  # Download keys of all files referenced in the HTML fed so far

    def handle_starttag(self, tag, attrs):
        """
        Handles a start tag in the HTML content being parsed. For ``<a>`` and ``<img>`` elements, the
        ``href`` and ``src`` attributes are inspected; if one of them is a download URL known to the
        file module, its blob key is added to :attr:`blobs`.

        :param str tag: The current start tag encountered by the parser.
        :param List[Tuple[str, str]] attrs: A list of tuples containing the attribute name and value of the current tag.
        """
        if tag not in ("a", "img"):
            return

        for k, v in attrs:
            # Images reference their file through "src", links through "href".
            # Attributes written without a value are reported as None by HTMLParser and carry no URL.
            if k not in ("src", "href") or not v:
                continue

            file = getattr(conf.main_app.vi, "file", None)
            # parse_download_url yields a falsy result for URLs which are not file downloads
            if file and (filepath := file.parse_download_url(v)):
                self.blobs.add(filepath.dlkey)
class HtmlSerializer(HTMLParser):  # html.parser.HTMLParser
    """
    A custom HTML parser that extends the HTMLParser class to sanitize and serialize HTML content
    by removing invalid tags and attributes while retaining the valid ones.

    :param dict validHtml: A dictionary containing valid HTML tags, attributes, styles, and classes.
        Without it, every tag is dropped and only the escaped text remains.
    :param dict srcSet: A dictionary containing width and height for srcset attribute processing.
    :param bool convert_charrefs: Passed through to :class:`html.parser.HTMLParser`.
    """
    # Escapes markup-relevant characters in text nodes; line breaks and NUL bytes are dropped
    __html_serializer_trans = str.maketrans({
        "<": "&lt;",
        ">": "&gt;",
        "\"": "&quot;",
        "'": "&#39;",
        "\n": "",
        "\0": "",
    })

    # Characters which must not occur in attribute names/values or style declarations
    _filter_chars = "\"'\\\0\r\n@()"
    # Subset of _filter_chars that stays forbidden even in free-text attributes (title, href, alt)
    _quote_chars = "\"'\\\0\r\n"
    # Characters allowed in CSS class names
    _class_chars = frozenset(string.ascii_lowercase + string.ascii_uppercase + string.digits + "-")
    # C0 control characters and space; browsers ignore them in front of a URL
    _url_lead_chars = "".join(map(chr, range(0x21)))

    def __init__(self, validHtml=None, srcSet=None, convert_charrefs: bool = True):
        super().__init__(convert_charrefs=convert_charrefs)
        self.result = ""  # The final result that will be returned
        self.openTagsList = []  # List of tags that still need to be closed (innermost first)
        self.tagCache = []  # (start-tag markup, tag name) tuples that have been processed but not written yet
        self.validHtml = validHtml
        self.srcSet = srcSet

    def handle_data(self, data):
        """
        Handles the data encountered in the HTML content being parsed. Escapes special characters
        and appends the data to the result if it is not only whitespace characters.

        :param str data: The data encountered by the parser.
        """
        data = str(data).translate(HtmlSerializer.__html_serializer_trans)
        if data.strip():
            self.flushCache()
            self.result += data

    def handle_charref(self, name):
        """
        Handles character references in the HTML content being parsed and appends the character reference to the
        result.

        :param str name: The name of the character reference.
        """
        self.flushCache()
        self.result += f"&#{name};"

    def handle_entityref(self, name):  # FIXME
        """
        Handles entity references in the HTML content being parsed and appends the entity reference to the result.
        Unknown entity names are dropped.

        :param str name: The name of the entity reference.
        """
        if name in htmlentitydefs.entitydefs:
            self.flushCache()
            self.result += f"&{name};"

    def flushCache(self):
        """
        Flush pending tags into the result and push their corresponding end-tags onto the stack
        """
        for start, end in self.tagCache:
            self.result += start
            self.openTagsList.insert(0, end)
        self.tagCache = []

    @classmethod
    def _has_script_scheme(cls, value: str) -> bool:
        """
        Tells whether *value* would be treated as a ``javascript:`` URL by a browser.

        Browsers skip leading control characters/spaces and remove tabs and line breaks from URLs
        before resolving the scheme, so these are removed before the comparison as well.
        """
        probe = value.lstrip(cls._url_lead_chars)
        for ignored in "\t\r\n":
            probe = probe.replace(ignored, "")
        return probe.lower().startswith("javascript")

    def _filter_styles(self, styles) -> dict:
        """Returns the white-listed ``directive: value`` pairs out of the raw style declarations."""
        accepted = {}
        for declaration in styles:
            style, separator, value = declaration.partition(":")
            if not separator:
                # Not a "directive: value" pair
                continue
            style = style.strip()
            value = value.strip()
            if any(c in style for c in self._filter_chars) or any(c in value for c in self._filter_chars):
                # Either the key or the value contains a character that's not supposed to be there
                continue
            if value.lower().startswith(("expression", "import")):
                # IE evaluates JS inside styles if the keyword expression is present
                continue
            if style in self.validHtml["validStyles"] and not any(x in value for x in ("\"", ":", ";")):
                accepted[style] = value
        return accepted

    def _filter_classes(self, classes) -> list:
        """Returns the class names that match :prop:validClasses exactly or by a ``prefix*`` entry."""
        accepted = []
        for currentClass in classes:
            if not currentClass or not all(x in self._class_chars for x in currentClass):
                # The class is empty or contains invalid characters
                continue
            for validClass in self.validHtml["validClasses"]:
                # Check if the classname matches or is white-listed by a prefix
                if validClass == currentClass or (
                    validClass.endswith("*") and currentClass.startswith(validClass[:-1])
                ):
                    accepted.append(currentClass)
                    break
        return accepted

    def handle_starttag(self, tag, attrs):
        """
        Handles start tags in the HTML content being parsed. Filters out invalid tags and attributes and
        processes valid ones.

        :param str tag: The current start tag encountered by the parser.
        :param List[Tuple[str, str]] attrs: A list of tuples containing the attribute name and value of the current tag.
        """
        if not (self.validHtml and tag in self.validHtml["validTags"]):
            # The tag is dropped; leave a blank so the surrounding text doesn't run together
            self.result += " "
            return

        cacheTagStart = "<" + tag
        isBlankTarget = False
        styles = None
        classes = None
        allowedAttrs = self.validHtml["validAttrs"].get(tag, ())

        for k, v in attrs:
            k = k.strip()
            # Attributes written without a value (e.g. <a download>) are reported as None
            v = (v or "").strip()
            srcSetAttr = ""

            if any(c in k for c in self._filter_chars) or any(c in v for c in self._filter_chars):
                # If we have a title, href or alt attribute, ignore @ and ()
                if not (k in {"title", "href", "alt"} and not any(c in v for c in self._quote_chars)):
                    # Either the key or the value contains a character that's not supposed to be there
                    continue
            elif k == "class":
                # Classes are handled below
                classes = v.split(" ")
                continue
            elif k == "style":
                # Styles are handled below
                styles = v.split(";")
                continue
            elif k == "src":
                # We ensure that any src tag starts with an actual url
                if not v.lower().startswith(("http://", "https://", "/")):
                    continue

                file = getattr(conf.main_app.vi, "file", None)
                if file and (filepath := file.parse_download_url(v)):
                    v = file.create_download_url(
                        filepath.dlkey,
                        filepath.filename,
                        filepath.is_derived,
                        expires=None
                    )

                    if self.srcSet:
                        # Build the src set with files already available. If a derived file is not yet build,
                        # getReferencedBlobs will catch it, build it, and we're going to be re-called afterwards.
                        srcSet = file.create_src_set(
                            filepath.dlkey,
                            None,
                            self.srcSet.get("width"),
                            self.srcSet.get("height")
                        )
                        srcSetAttr = f' srcSet="{srcSet}"'

            if k not in allowedAttrs:
                # That attribute is not valid on this tag
                continue

            # The src set is only written if the src attribute itself is accepted for this tag
            cacheTagStart += srcSetAttr
            if k.lower()[0:2] != "on" and not self._has_script_scheme(v):
                cacheTagStart += f' {k}="{v}"'
            if tag == "a" and k == "target" and v.lower() == "_blank":
                isBlankTarget = True

        if styles:
            styleRes = self._filter_styles(styles)
            if styleRes:
                cacheTagStart += f""" style=\"{"; ".join(f"{k}: {v}" for k, v in styleRes.items())}\""""

        if classes:
            validClasses = self._filter_classes(classes)
            if validClasses:
                cacheTagStart += f""" class=\"{" ".join(validClasses)}\""""

        if isBlankTarget:
            # Add rel tag to prevent the browser to pass window.opener around
            cacheTagStart += " rel=\"noopener noreferrer\""

        if tag in self.validHtml["singleTags"]:
            # Single-Tags do have a visual representation; ensure it makes it into the result
            self.flushCache()
            self.result += cacheTagStart + ">"  # dont need slash in void elements in html5
        else:
            # We opened a 'normal' tag; push it on the cache so it can be discarded later if
            # we detect it has no content
            cacheTagStart += ">"
            self.tagCache.append((cacheTagStart, tag))

    def handle_endtag(self, tag):
        """
        Handles end tags in the HTML content being parsed. Closes open tags and discards invalid ones.

        :param str tag: The current end tag encountered by the parser.
        """
        if not self.validHtml:
            return

        if self.tagCache and (tag in self.openTagsList or any(cached == tag for _, cached in self.tagCache)):
            # The element (or something inside of it) is still on the cache, which means it has
            # no content: silently drop the cache, innermost tag first, up to that point
            while self.tagCache:
                _, cachedTag = self.tagCache.pop()
                if cachedTag == tag:
                    return

        if tag in self.openTagsList:
            # Close all currently open tags until we reach the current one
            while self.openTagsList:
                endTag = self.openTagsList.pop(0)
                self.result += f"</{endTag}>"
                if endTag == tag:
                    break

    def cleanup(self):  # FIXME: swapped (wrongly nested) tags
        """ Append missing closing tags to the result."""
        self.flushCache()
        for tag in self.openTagsList:
            self.result += f"</{tag}>"
        # Everything is closed now; a second call must not emit the end tags again
        self.openTagsList = []

    def sanitize(self, instr):
        """
        Sanitizes the input HTML string by removing invalid tags and attributes while retaining valid ones.

        :param str instr: The input HTML string to be sanitized.
        :return: The sanitized HTML string.
        :rtype: str
        """
        # Start from a clean parser and serializer state, so an instance can be reused
        self.reset()
        self.result = ""
        self.openTagsList = []
        self.tagCache = []
        self.feed(instr)
        self.close()
        self.cleanup()
        return self.result
class TextBone(BaseBone):
    """
    A bone for storing and validating HTML or plain text content. Can be configured to allow
    only specific HTML tags and attributes, and enforce a maximum length. Supports the use of
    srcset for embedded images.

    :param Union[None, Dict] validHtml: A dictionary containing allowed HTML tags and their attributes. Defaults
        to _defaultTags. Must be a structured like :prop:_defaultTags
    :param int max_length: The maximum allowed length for the content. Defaults to 200000.
    :param languages: If set, this bone can store a different content for each language
    :param Dict[str, List] srcSet: An optional dictionary containing width and height for srcset generation.
        Must be a dict of "width": [List of Ints], "height": [List of Ints], eg {"height": [720, 1080]}
    :param bool indexed: Whether the content should be indexed for searching. Defaults to False.
    :param kwargs: Additional keyword arguments to be passed to the base class constructor.
    """

    class __undefinedC__:
        # Sentinel which tells "validHtml not passed" apart from an explicit validHtml=None
        pass

    type = "text"

    def __init__(
        self,
        *,
        validHtml: None | dict = __undefinedC__,
        max_length: int = 200000,
        srcSet: t.Optional[dict[str, list]] = None,
        indexed: bool = False,
        **kwargs
    ):
        """
        :param validHtml: If set, must be a structure like :prop:_defaultTags
        :param languages: If set, this bone can store a different content for each language
        :param max_length: Limit content to max_length bytes
        :param indexed: Must not be set True, unless you limit max_length accordingly
        :param srcSet: If set, inject srcset tags to embedded images. Must be a dict of
            "width": [List of Ints], "height": [List of Ints], eg {"height": [720, 1080]}
        """
        # fixme: Remove in viur-core >= 4
        if "maxLength" in kwargs:
            warnings.warn("maxLength parameter is deprecated, please use max_length", DeprecationWarning)
            max_length = kwargs.pop("maxLength")
        super().__init__(indexed=indexed, **kwargs)

        # The sentinel is compared by identity; an explicitly passed None is kept as it is
        if validHtml is TextBone.__undefinedC__:
            validHtml = _defaultTags

        self.validHtml = validHtml
        self.max_length = max_length
        self.srcSet = srcSet

    def singleValueSerialize(self, value, skel: 'SkeletonInstance', name: str, parentIndexed: bool):
        """
        Serializes a single value of the TextBone instance for storage.

        This method takes the value as-is without any additional processing, since it's already stored in a format
        suitable for serialization.
        """
        return value

    def singleValueFromClient(self, value, skel, bone_name, client_data):
        """
        Validates a single value received from the client and sanitizes its HTML.

        :return: A tuple of the sanitized value and None on success; otherwise a tuple of the empty value
            and a list holding one ReadFromClientError.
        """
        if err := self.isInvalid(value):  # Returns None on success, error-str otherwise
            return self.getEmptyValue(), [ReadFromClientError(ReadFromClientErrorSeverity.Invalid, err)]

        return HtmlSerializer(self.validHtml, self.srcSet, False).sanitize(value), None

    def getEmptyValue(self):
        """
        Returns an empty value for the TextBone instance.

        This method is used to represent an empty or unset value for the TextBone.

        :return: An empty string.
        :rtype: str
        """
        return ""

    def isInvalid(self, value):
        """
        Checks if the given value is valid for this TextBone instance.

        This method checks whether the given value is valid according to the TextBone's constraints (e.g., not
        None and within the maximum length).

        :param value: The value to be checked for validity.
        :return: Returns None if the value is valid, or an error message string otherwise.
        :rtype: Optional[str]
        """
        if value is None:
            return "No value entered"
        if len(value) > self.max_length:
            return "Maximum length exceeded"
        return None

    def getReferencedBlobs(self, skel: 'viur.core.skeleton.SkeletonInstance', name: str) -> set[str]:
        """
        Extracts and returns the blob keys of referenced files in the HTML content of the TextBone instance.

        This method parses the HTML content of the TextBone to identify embedded images or file hrefs,
        collects their blob keys, and ensures that they are not deleted even if removed from the file browser,
        preventing broken links or images in the TextBone content.

        :param SkeletonInstance skel: A SkeletonInstance object containing the data of an entry.
        :param str name: The name of the TextBone for which to find referenced blobs.
        :return: A set containing the blob keys of the referenced files in the TextBone's HTML content.
        :rtype: Set[str]
        """
        collector = CollectBlobKeys()

        for idx, lang, value in self.iter_bone_value(skel, name):
            if value:
                collector.feed(value)

        blob_keys = collector.blobs

        if blob_keys and self.srcSet:
            # One thumbnail per configured width and per configured height
            derive_dict = {
                "thumbnail": [
                    {"width": x} for x in (self.srcSet.get("width") or [])
                ] + [
                    {"height": x} for x in (self.srcSet.get("height") or [])
                ]
            }
            # Imported at call time rather than at module level; presumably to avoid a circular import
            from viur.core.bones.file import ensureDerived
            for blob_key in blob_keys:
                file_obj = db.Query("file").filter("dlkey =", blob_key) \
                    .order(("creationdate", db.SortOrder.Ascending)).getEntry()
                if file_obj:
                    ensureDerived(file_obj.key, f"{skel.kindName}_{name}", derive_dict, skel["key"])

        return blob_keys

    def refresh(self, skel, boneName) -> None:
        """
        Re-parses the text content of the TextBone instance to rebuild the src-set if necessary.

        This method is useful when the src-set configuration has changed and needs to be applied
        to the existing HTML content. It re-parses the content and updates the src-set attributes
        accordingly.

        :param SkeletonInstance skel: A SkeletonInstance object containing the data of an entry.
        :param str boneName: The name of the TextBone for which to refresh the src-set.
        """
        if self.srcSet:
            val = skel[boneName]
            if self.languages and isinstance(val, dict):
                skel[boneName] = {k: self.singleValueFromClient(v, skel, boneName, None)[0] for k, v in val.items()}
            elif not self.languages and isinstance(val, str):
                skel[boneName] = self.singleValueFromClient(val, skel, boneName, None)[0]

    def getSearchTags(self, skel: 'viur.core.skeleton.SkeletonInstance', name: str) -> set[str]:
        """
        Extracts search tags from the text content of a TextBone.

        This method iterates over the values of the TextBone in the given skeleton, and for each non-empty value,
        it tokenizes the text by lines and words. Then, it adds the lowercase version of each word to a set of
        search tags, which is returned at the end.

        :param skel: A SkeletonInstance containing the TextBone.
        :param name: The name of the TextBone in the skeleton.
        :return: A set of unique search tags (lowercase words) extracted from the text content of the TextBone.
        """
        result = set()
        for idx, lang, value in self.iter_bone_value(skel, name):
            if value is None:
                continue
            for line in str(value).splitlines():
                # split() without separator skips runs of whitespace, so no empty tag is produced
                for word in line.split():
                    result.add(word.lower())
        return result

    def getUniquePropertyIndexValues(self, valuesCache: dict, name: str) -> list[str]:
        """
        Retrieves the unique property index values for the TextBone.

        If the TextBone supports multiple languages, this method raises a NotImplementedError, as it's unclear
        whether each language should be kept distinct or not. Otherwise, it calls the superclass's
        getUniquePropertyIndexValues method to retrieve the unique property index values.

        :param valuesCache: A dictionary containing the cached values for the TextBone.
        :param name: The name of the TextBone.
        :return: A list of unique property index values for the TextBone.
        :raises NotImplementedError: If the TextBone supports multiple languages.
        """
        if self.languages:
            # Not yet implemented as it's unclear if we should keep each language distinct or not
            raise NotImplementedError()

        return super().getUniquePropertyIndexValues(valuesCache, name)

    def structure(self) -> dict:
        """Returns the structure of the base bone, extended by the ``valid_html`` configuration."""
        return super().structure() | {
            "valid_html": self.validHtml,
        }