Coverage for /home/runner/work/viur-core/viur-core/viur/src/viur/core/bones/text.py: 47%
221 statements
« prev ^ index » next coverage.py v7.6.10, created at 2025-02-07 19:28 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2025-02-07 19:28 +0000
1"""
2The `text` module contains the `TextBone` and a custom HTML parser
3to validate and extract client data for the `TextBone`.
4"""
5import html
6import string
7import typing as t
8import warnings
9from html.parser import HTMLParser
10from viur.core import db, conf
11from viur.core.bones.base import BaseBone, ReadFromClientError, ReadFromClientErrorSeverity
class HtmlBoneConfiguration(t.TypedDict):
    """Typed dictionary describing which HTML constructs a TextBone accepts."""

    validTags: list[str]
    """Names of the HTML tags that may appear in TextBone content."""

    validAttrs: dict[str, list[str]]
    """Allowed attribute names per tag. A tag missing from this mapping accepts no attributes."""

    validStyles: list[str]
    """CSS directives that are permitted in TextBone content."""

    validClasses: list[str]
    """CSS class names that are permitted in TextBone content."""

    singleTags: list[str]
    """Self-closing tags, i.e. tags that have no corresponding end tag."""
class CollectBlobKeys(HTMLParser):
    """
    HTML parser that collects the blob keys of files referenced by ``<a>`` and ``<img>`` tags.

    After HTML has been fed into the parser, :attr:`blobs` contains the ``dlkey`` of every
    referenced URL that the file module's ``parse_download_url`` was able to resolve.
    """

    # Attributes that can carry a download URL: ``src`` on images, ``href`` on links.
    # ``src`` stays accepted on both tags to remain compatible with the previous behavior.
    _URL_ATTRS = ("src", "href")

    def __init__(self):
        super().__init__()
        self.blobs = set()

    def handle_starttag(self, tag, attrs):
        """
        Inspect a start tag and record the blob key of any file it references.

        Only ``<a>`` and ``<img>`` elements are considered. For each ``src``/``href`` attribute
        with a value, the URL is handed to the file module; when it resolves to a download
        path, its ``dlkey`` is added to :attr:`blobs`.

        :param str tag: The current start tag encountered by the parser.
        :param List[Tuple[str, str]] attrs: ``(name, value)`` pairs of the tag's attributes.
        """
        if tag not in ("a", "img"):
            return

        for key, value in attrs:
            # HTMLParser reports attributes without a value (e.g. ``<img src>``) as None;
            # there is nothing to resolve for those or for empty values.
            if key not in self._URL_ATTRS or not value:
                continue

            file = getattr(conf.main_app.vi, "file", None)
            if file and (filepath := file.parse_download_url(value)):
                self.blobs.add(filepath.dlkey)
class HtmlSerializer(HTMLParser):
    """
    HTML parser that sanitizes markup against a whitelist and serializes the outcome.

    Tags, attributes, CSS directives and CSS classes that are not listed in ``validHtml`` are
    dropped. Start tags are held back in a cache until content follows, so that elements which
    are closed again without any content never reach the result.

    :param validHtml: The whitelist configuration (tags, attributes, styles, classes, single tags).
        If it is falsy, no tag is accepted at all.
    :param srcSet: Optional dictionary with ``width`` and/or ``height`` lists used to build a
        ``srcSet`` attribute for ``src`` URLs that the file module can resolve.
    :param convert_charrefs: Passed through to :class:`html.parser.HTMLParser`.
    """

    # Characters that are escaped (or removed) when text content is written to the result.
    __html_serializer_trans = str.maketrans({
        "<": "&lt;",
        ">": "&gt;",
        "\"": "&quot;",
        "'": "&#39;",
        "\n": "",
        "\0": "",
    })

    # Characters that are not accepted in attribute names/values and style declarations.
    _FILTER_CHARS = "\"'\\\0\r\n@()"
    # Subset that stays forbidden for title/href/alt, which may contain "@" and parentheses.
    _STRICT_FILTER_CHARS = "\"'\\\0\r\n"
    # Characters a CSS class name may consist of.
    _VALID_CLASS_CHARS = frozenset(string.ascii_letters + string.digits + "-")

    def __init__(self, validHtml: "HtmlBoneConfiguration | None" = None, srcSet=None,
                 convert_charrefs: bool = True):
        super().__init__(convert_charrefs=convert_charrefs)
        self.result = ""  # The final result that will be returned
        self.openTagsList = []  # End tags that still need to be written, innermost first
        self.tagCache = []  # (start-tag markup, tag name) pairs processed but not written yet
        self.validHtml = validHtml
        self.srcSet = srcSet

    @staticmethod
    def _has_any(text, chars):
        """Return True if ``text`` contains at least one of the characters in ``chars``."""
        return any(char in text for char in chars)

    def handle_data(self, data):
        """
        Escape text content and append it to the result.

        Data consisting only of whitespace is dropped and does not flush pending tags.

        :param str data: The data encountered by the parser.
        """
        data = str(data).translate(HtmlSerializer.__html_serializer_trans)
        if data.strip():
            self.flushCache()
            self.result += data

    def handle_charref(self, name):
        """
        Append a numeric character reference (``&#...;``) to the result.

        :param str name: The name of the character reference.
        """
        self.flushCache()
        self.result += f"&#{name};"

    def handle_entityref(self, name):  # FIXME
        """
        Append a named entity reference to the result if it is a known HTML entity.

        Unknown entity names are dropped silently.

        :param str name: The name of the entity reference.
        """
        if name in html.entities.entitydefs:
            self.flushCache()
            self.result += f"&{name};"

    def flushCache(self):
        """
        Flush pending tags into the result and push their corresponding end-tags onto the stack
        """
        for start, end in self.tagCache:
            self.result += start
            self.openTagsList.insert(0, end)
        self.tagCache = []

    def _filter_styles(self, styles):
        """Return the whitelisted ``{directive: value}`` pairs from raw ``style`` declarations."""
        accepted = {}
        for declaration in styles:
            name, separator, value = declaration.partition(":")
            if not separator:
                # Not a "directive: value" pair (e.g. the empty part after a trailing ";")
                continue
            name = name.strip()
            value = value.strip()
            if self._has_any(name, self._FILTER_CHARS) or self._has_any(value, self._FILTER_CHARS):
                # Either the key or the value contains a character that's not supposed to be there
                continue
            if value.lower().startswith(("expression", "import")):
                # IE evaluates JS inside styles if the keyword expression is present
                continue
            if name in self.validHtml["validStyles"] and not self._has_any(value, "\":;"):
                accepted[name] = value
        return accepted

    def _filter_classes(self, classes):
        """Return the class names that are whitelisted, either exactly or by a ``prefix*`` entry."""
        accepted = []
        for current in classes:
            if not current or not all(char in self._VALID_CLASS_CHARS for char in current):
                # Empty (from consecutive blanks) or containing invalid characters
                continue
            for pattern in self.validHtml["validClasses"]:
                # Check if the classname matches or is white-listed by a prefix
                if pattern == current or (pattern.endswith("*") and current.startswith(pattern[:-1])):
                    accepted.append(current)
                    break
        return accepted

    def handle_starttag(self, tag, attrs):
        """
        Filter a start tag and its attributes and queue the sanitized tag for output.

        Tags that are not whitelisted are replaced by a single blank. Valid single tags are
        written immediately, all other valid tags are cached until content follows.

        :param str tag: The current start tag encountered by the parser.
        :param List[Tuple[str, str]] attrs: ``(name, value)`` pairs of the tag's attributes.
        """
        if not (self.validHtml and tag in self.validHtml["validTags"]):
            self.result += " "
            return

        cacheTagStart = "<" + tag
        isBlankTarget = False
        styles = None
        classes = None
        allowedAttrs = self.validHtml["validAttrs"].get(tag, ())

        for k, v in attrs:
            k = k.strip()
            # HTMLParser passes None for attributes without a value (e.g. ``<a download>``)
            v = (v or "").strip()

            if self._has_any(k, self._FILTER_CHARS) or self._has_any(v, self._FILTER_CHARS):
                # For title, href and alt, "@" and parentheses are tolerated in the value
                if not (k in {"title", "href", "alt"}
                        and not self._has_any(v, self._STRICT_FILTER_CHARS)):
                    # Either the key or the value contains a character that's not supposed to be there
                    continue
            elif k == "class":
                # Classes are handled below
                classes = v.split(" ")
                continue
            elif k == "style":
                # Styles are handled below
                styles = v.split(";")
                continue
            elif k == "src":
                # We ensure that any src tag starts with an actual url
                if not v.lower().startswith(("http://", "https://", "/")):
                    continue

                file = getattr(conf.main_app.vi, "file", None)
                if file and (filepath := file.parse_download_url(v)):
                    v = file.create_download_url(
                        filepath.dlkey,
                        filepath.filename,
                        filepath.is_derived,
                        expires=None
                    )

                    if self.srcSet:
                        # Build the src set with files already available. If a derived file is not yet build,
                        # getReferencedBlobs will catch it, build it, and we're going to be re-called afterwards.
                        srcSet = file.create_src_set(
                            filepath.dlkey,
                            None,
                            self.srcSet.get("width"),
                            self.srcSet.get("height")
                        )
                        cacheTagStart += f' srcSet="{srcSet}"'

            if k not in allowedAttrs:
                # That attribute is not valid on this tag
                continue
            # Never emit event handlers or javascript: URLs
            if not k.lower().startswith("on") and not v.lower().startswith("javascript"):
                cacheTagStart += f' {k}="{v}"'
            if tag == "a" and k == "target" and v.lower() == "_blank":
                isBlankTarget = True

        if styles:
            styleRes = self._filter_styles(styles)
            if styleRes:
                joined = "; ".join(f"{name}: {value}" for name, value in styleRes.items())
                cacheTagStart += f' style="{joined}"'

        if classes:
            validClasses = self._filter_classes(classes)
            if validClasses:
                cacheTagStart += ' class="' + " ".join(validClasses) + '"'

        if isBlankTarget:
            # Add rel tag to prevent the browser to pass window.opener around
            cacheTagStart += " rel=\"noopener noreferrer\""

        if tag in self.validHtml["singleTags"]:
            # Single-Tags do have a visual representation; ensure it makes it into the result
            self.flushCache()
            self.result += cacheTagStart + ">"  # dont need slash in void elements in html5
        else:
            # We opened a 'normal' tag; push it on the cache so it can be discarded later if
            # we detect it has no content
            self.tagCache.append((cacheTagStart + ">", tag))

    def handle_endtag(self, tag):
        """
        Close the given tag, discarding cached start tags that never received content.

        :param str tag: The current end tag encountered by the parser.
        """
        if not self.validHtml:
            return

        if self.tagCache and tag in [cached[1] for cached in self.tagCache] + self.openTagsList:
            # The element (or something nested in it) is still on the cache: silently drop
            # the cache from the end up to that element. Popping from the end (instead of
            # list.remove(), which deletes the FIRST equal entry) keeps the order of the
            # remaining entries intact when the same tag is nested more than once.
            while self.tagCache:
                if self.tagCache.pop()[1] == tag:
                    return

        if tag in self.openTagsList:
            # Close all currently open tags until we reach the current one
            while self.openTagsList:
                endTag = self.openTagsList.pop(0)
                self.result += f"</{endTag}>"
                if endTag == tag:
                    break

    def cleanup(self):  # FIXME: swapped tags
        """ Append missing closing tags to the result."""
        self.flushCache()
        for tag in self.openTagsList:
            self.result += f"</{tag}>"

    def sanitize(self, instr):
        """
        Sanitize an HTML string by removing invalid tags and attributes while retaining valid ones.

        All per-run state is reset first, so an instance can be used for several inputs.

        :param str instr: The input HTML string to be sanitized.
        :return: The sanitized HTML string.
        :rtype: str
        """
        self.reset()  # drop parser state left behind by a previous run
        self.result = ""
        self.openTagsList = []
        self.tagCache = []
        self.feed(instr)
        self.close()
        self.cleanup()
        return self.result
class TextBone(BaseBone):
    """
    A bone for storing and validating HTML or plain text content. Can be configured to allow
    only specific HTML tags and attributes, and enforce a maximum length. Supports the use of
    srcset for embedded images.

    :param validHtml: A dictionary containing allowed HTML tags and their attributes.
        Defaults to `conf.bone_html_default_allow`.
    :param max_length: The maximum allowed length for the content. Defaults to 200000.
    :param languages: If set, this bone can store a different content for each language
    :param srcSet: An optional dictionary containing width and height for srcset generation.
        Must be a dict of "width": [List of Ints], "height": [List of Ints], eg {"height": [720, 1080]}
    :param indexed: Whether the content should be indexed for searching. Defaults to False.
    :param kwargs: Additional keyword arguments to be passed to the base class constructor.
    """

    class __undefinedC__:
        """Sentinel default for ``validHtml``: tells "not passed" apart from an explicit ``None``."""

    type = "text"

    def __init__(
        self,
        *,
        validHtml: None | HtmlBoneConfiguration = __undefinedC__,
        max_length: int = 200000,
        srcSet: t.Optional[dict[str, list]] = None,
        indexed: bool = False,
        **kwargs
    ):
        """
        :param validHtml: If set, must be a structure like `conf.bone_html_default_allow`
        :param languages: If set, this bone can store a different content for each language
        :param max_length: Limit content to max_length characters (compared against ``len(value)``)
        :param indexed: Must not be set True, unless you limit max_length accordingly
        :param srcSet: If set, inject srcset tags to embedded images. Must be a dict of
            "width": [List of Ints], "height": [List of Ints], eg {"height": [720, 1080]}
        """
        # fixme: Remove in viur-core >= 4
        if "maxLength" in kwargs:
            warnings.warn(
                "maxLength parameter is deprecated, please use max_length",
                DeprecationWarning,
                stacklevel=2,  # attribute the warning to the code constructing the bone
            )
            max_length = kwargs.pop("maxLength")
        super().__init__(indexed=indexed, **kwargs)

        # Identity check: the sentinel is a class object, not a value to compare by equality
        if validHtml is TextBone.__undefinedC__:
            validHtml = conf.bone_html_default_allow

        self.validHtml = validHtml
        self.max_length = max_length
        self.srcSet = srcSet

    def singleValueSerialize(self, value, skel: 'SkeletonInstance', name: str, parentIndexed: bool):
        """
        Serializes a single value of the TextBone instance for storage.

        The value is returned unchanged.
        """
        return value

    def singleValueFromClient(self, value, skel, bone_name, client_data):
        """
        Validate and sanitize a single value received from the client.

        :return: A tuple ``(value, errors)``: the sanitized HTML and ``None`` on success, or the
            empty value and a list with one ``ReadFromClientError`` if the value is invalid.
        """
        if err := self.isInvalid(value):  # Returns None on success, error-str otherwise
            return self.getEmptyValue(), [ReadFromClientError(ReadFromClientErrorSeverity.Invalid, err)]

        return HtmlSerializer(self.validHtml, self.srcSet, False).sanitize(value), None

    def getEmptyValue(self):
        """
        Returns an empty value for the TextBone instance.

        :return: An empty string.
        :rtype: str
        """
        return ""

    def isInvalid(self, value):
        """
        Checks if the given value is valid for this TextBone instance.

        A value is rejected if it is None or longer than ``max_length``.

        :param value: The value to be checked for validity.
        :return: Returns None if the value is valid, or an error message string otherwise.
        :rtype: Optional[str]
        """
        if value is None:
            return "No value entered"
        if len(value) > self.max_length:
            return "Maximum length exceeded"
        return None

    def getReferencedBlobs(self, skel: 'viur.core.skeleton.SkeletonInstance', name: str) -> set[str]:
        """
        Extracts and returns the blob keys of referenced files in the HTML content of the TextBone instance.

        All non-empty values of the bone are parsed with :class:`CollectBlobKeys`. If a ``srcSet``
        is configured, ``ensureDerived`` is additionally called for every file entry found for the
        collected keys, with a "thumbnail" derive request for each configured width and height.

        :param SkeletonInstance skel: A SkeletonInstance object containing the data of an entry.
        :param str name: The name of the TextBone for which to find referenced blobs.
        :return: A set containing the blob keys of the referenced files in the TextBone's HTML content.
        :rtype: Set[str]
        """
        collector = CollectBlobKeys()

        for idx, lang, value in self.iter_bone_value(skel, name):
            if value:
                collector.feed(value)

        blob_keys = collector.blobs

        if blob_keys and self.srcSet:
            derive_dict = {
                "thumbnail": [
                    {"width": x} for x in (self.srcSet.get("width") or [])
                ] + [
                    {"height": x} for x in (self.srcSet.get("height") or [])
                ]
            }
            from viur.core.bones.file import ensureDerived
            for blob_key in blob_keys:
                # Several file entries may share a dlkey; the oldest one is used
                file_obj = db.Query("file").filter("dlkey =", blob_key) \
                    .order(("creationdate", db.SortOrder.Ascending)).getEntry()
                if file_obj:
                    ensureDerived(file_obj.key, f"{skel.kindName}_{name}", derive_dict, skel["key"])

        return blob_keys

    def refresh(self, skel, boneName) -> None:
        """
        Re-parses the text content of the TextBone instance to rebuild the src-set if necessary.

        Does nothing unless a ``srcSet`` is configured. Otherwise every stored value is run
        through :meth:`singleValueFromClient` again and written back to the skeleton.

        :param SkeletonInstance skel: A SkeletonInstance object containing the data of an entry.
        :param str boneName: The name of the TextBone for which to refresh the src-set.
        """
        if self.srcSet:
            val = skel[boneName]
            if self.languages and isinstance(val, dict):
                skel[boneName] = {k: self.singleValueFromClient(v, skel, boneName, None)[0] for k, v in val.items()}
            elif not self.languages and isinstance(val, str):
                skel[boneName] = self.singleValueFromClient(val, skel, boneName, None)[0]

    def getSearchTags(self, skel: 'viur.core.skeleton.SkeletonInstance', name: str) -> set[str]:
        """
        Extracts search tags from the text content of a TextBone.

        Every value that is not None is split into lines and then into blank-separated words;
        the lowercase form of each non-empty word is added to the result.

        :param skel: A SkeletonInstance containing the TextBone.
        :param name: The name of the TextBone in the skeleton.
        :return: A set of unique search tags (lowercase words) extracted from the text content of the TextBone.
        """
        result = set()
        for idx, lang, value in self.iter_bone_value(skel, name):
            if value is None:
                continue
            for line in str(value).splitlines():
                # Consecutive blanks yield empty strings, which are useless as search tags
                result.update(word.lower() for word in line.split(" ") if word)
        return result

    def getUniquePropertyIndexValues(self, valuesCache: dict, name: str) -> list[str]:
        """
        Retrieves the unique property index values for the TextBone.

        If the TextBone supports multiple languages, this method raises a NotImplementedError, as it's unclear
        whether each language should be kept distinct or not. Otherwise, it calls the superclass's
        getUniquePropertyIndexValues method to retrieve the unique property index values.

        :param valuesCache: A dictionary containing the cached values for the TextBone.
        :param name: The name of the TextBone.
        :return: A list of unique property index values for the TextBone.
        :raises NotImplementedError: If the TextBone supports multiple languages.
        """
        if self.languages:
            # Not yet implemented as it's unclear if we should keep each language distinct or not
            raise NotImplementedError()

        return super().getUniquePropertyIndexValues(valuesCache, name)

    def structure(self) -> dict:
        """Return the base class structure extended by the ``valid_html`` configuration."""
        return super().structure() | {
            "valid_html": self.validHtml,
        }