Coverage for /home/runner/work/viur-core/viur-core/viur/src/viur/core/bones/text.py: 46%

214 statements  

« prev     ^ index     » next       coverage.py v7.6.3, created at 2024-10-16 22:16 +0000

1""" 

2The `text` module contains the `TextBone` and a custom HTML-Parser 

3to validate and extract client data for the `TextBone`. 

4""" 

5import string 

6import warnings 

7from base64 import urlsafe_b64decode 

8from datetime import datetime 

9from html import entities as htmlentitydefs 

10from html.parser import HTMLParser 

11import typing as t 

12 

13from viur.core import db, conf 

14from viur.core.bones.base import BaseBone, ReadFromClientError, ReadFromClientErrorSeverity 

15 

_defaultTags = {
    # HTML tags that are allowed to remain in the text
    "validTags": [
        'b', 'a', 'i', 'u', 'span', 'div', 'p', 'img', 'ol', 'ul', 'li', 'abbr', 'sub', 'sup',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'table', 'thead', 'tbody', 'tfoot', 'tr', 'td', 'th', 'br',
        'hr', 'strong', 'blockquote', 'em',
    ],
    # Attributes allowed per tag; a tag that is not listed here may not carry any attribute
    "validAttrs": {
        "a": ["href", "target", "title"],
        "abbr": ["title"],
        "span": ["title"],
        # "srcset" must not be in this list. It will be injected by ViUR
        "img": ["src", "alt", "title"],
        "td": ["colspan", "rowspan"],
        "p": ["data-indent"],
        "blockquote": ["cite"],
    },
    # CSS directives that are allowed inside a style attribute
    "validStyles": ["color"],
    # Allowed CSS class names; an entry ending in "*" white-lists a prefix
    "validClasses": ["vitxt-*", "viur-txt-*"],
    # Tags which don't have a corresponding end tag
    "singleTags": ["br", "img", "hr"],
}
"""
Default configuration used to sanitize HTML content in TextBone instances.

- validTags (list[str]):
    HTML tags that are allowed in TextBone instances.
- validAttrs (dict[str, list[str]]):
    Allowed attributes for each tag. If a tag is not listed, no attributes are allowed for that tag.
- validStyles (list[str]):
    CSS directives that are allowed in TextBone instances.
- validClasses (list[str]):
    CSS class names that are allowed in TextBone instances.
- singleTags (list[str]):
    Self-closing HTML tags that don't have corresponding end tags.
"""

50 

51 

class CollectBlobKeys(HTMLParser):
    """
    HTML parser that gathers the download keys of files referenced through the "src" attribute
    of <a> and <img> tags.

    The keys found while feeding markup are accumulated in the ``blobs`` set.
    """

    def __init__(self):
        super().__init__()
        self.blobs = set()

    def handle_starttag(self, tag, attrs):
        """
        Inspects a start tag; for <a> and <img> elements every "src" attribute is handed to the
        file module's ``parse_download_url`` and, if that yields a result, its ``dlkey`` is added
        to ``blobs``.

        :param str tag: The current start tag encountered by the parser.
        :param List[Tuple[str, str]] attrs: A list of tuples containing the attribute name and value of the current tag.
        """
        if tag not in ("a", "img"):
            return

        for attr_name, attr_value in attrs:
            if attr_name != "src":
                continue

            # The file module is looked up on the "vi" renderer of the main app; it may be absent
            file_module = getattr(conf.main_app.vi, "file", None)
            if not file_module:
                continue

            parsed = file_module.parse_download_url(attr_value)
            if parsed:
                self.blobs.add(parsed.dlkey)

76 

77 

class HtmlSerializer(HTMLParser):  # html.parser.HTMLParser
    """
    A custom HTML parser that extends the HTMLParser class to sanitize and serialize HTML content
    by removing invalid tags and attributes while retaining the valid ones.

    :param dict validHtml: A dictionary containing valid HTML tags, attributes, styles, and classes.
    :param dict srcSet: A dictionary containing width and height for srcset attribute processing.
    :param bool convert_charrefs: Passed through to :class:`html.parser.HTMLParser`.
    """
    # Applied to all text data: escapes markup characters and removes newlines and NUL bytes
    __html_serializer_trans = str.maketrans(
        {"<": "&lt;",
         ">": "&gt;",
         "\"": "&quot;",
         "'": "&#39;",
         "\n": "",
         "\0": ""})

    def __init__(self, validHtml=None, srcSet=None, convert_charrefs: bool = True):
        super().__init__(convert_charrefs=convert_charrefs)
        self.result = ""  # The final result that will be returned
        self.openTagsList = []  # List of tags that still need to be closed
        self.tagCache = []  # List of (start-tag markup, tag name) processed but not written yet
        self.validHtml = validHtml
        self.srcSet = srcSet

    def handle_data(self, data):
        """
        Handles the data encountered in the HTML content being parsed. Escapes special characters
        and appends the data to the result if it is not only whitespace characters.

        :param str data: The data encountered by the parser.
        """
        data = str(data).translate(HtmlSerializer.__html_serializer_trans)
        if data.strip():
            self.flushCache()
            self.result += data

    def handle_charref(self, name):
        """
        Handles character references in the HTML content being parsed and appends the character reference to the
        result.

        :param str name: The name of the character reference.
        """
        self.flushCache()
        self.result += f"&#{name};"

    def handle_entityref(self, name):  # FIXME
        """
        Handles entity references in the HTML content being parsed and appends the entity reference to the result.
        References whose name is not a known HTML entity are dropped.

        :param str name: The name of the entity reference.
        """
        if name in htmlentitydefs.entitydefs:
            self.flushCache()
            self.result += f"&{name};"

    def flushCache(self):
        """
        Flush pending tags into the result and push their corresponding end-tags onto the stack
        """
        for start, end in self.tagCache:
            self.result += start
            self.openTagsList.insert(0, end)
        self.tagCache = []

    def _sanitizeStyles(self, styles, filterChars):
        """
        Builds the ``style`` attribute markup from a list of raw CSS declarations.

        :param list[str] styles: The declarations (the style attribute split at ";").
        :param str filterChars: Characters that must not occur in a directive or its value.
        :return: The markup to append to the start tag, or an empty string if nothing is allowed.
        :rtype: str
        """
        styleRes = {}
        for declaration in styles:
            style, separator, value = declaration.partition(":")
            if not separator:
                # Not a "directive: value" pair; nothing sensible can be derived from it
                continue
            style = style.strip()
            value = value.strip()
            if any(c in style for c in filterChars) or any(c in value for c in filterChars):
                # Either the key or the value contains a character that's not supposed to be there
                continue
            if value.lower().startswith(("expression", "import")):
                # IE evaluates JS inside styles if the keyword expression is present
                continue
            if style in self.validHtml["validStyles"] and not any(x in value for x in ("\"", ":", ";")):
                styleRes[style] = value
        if not styleRes:
            return ""
        return ' style="' + "; ".join(f"{style}: {value}" for style, value in styleRes.items()) + '"'

    def _sanitizeClasses(self, classes):
        """
        Builds the ``class`` attribute markup from a list of class names, keeping only names that
        are listed in ``validHtml["validClasses"]`` literally or by a prefix entry ending in "*".

        :param list[str] classes: The class names (the class attribute split at " ").
        :return: The markup to append to the start tag, or an empty string if nothing is allowed.
        :rtype: str
        """
        validClassChars = string.ascii_lowercase + string.ascii_uppercase + string.digits + "-"
        validClasses = []
        for currentClass in classes:
            if not all(x in validClassChars for x in currentClass):
                # The class contains invalid characters
                continue
            for validClass in self.validHtml["validClasses"]:
                # Check if the classname matches or is white-listed by a prefix
                if validClass == currentClass:
                    validClasses.append(currentClass)
                    break
                if validClass.endswith("*") and currentClass.startswith(validClass[:-1]):
                    validClasses.append(currentClass)
                    break
        if not validClasses:
            return ""
        return ' class="' + " ".join(validClasses) + '"'

    def handle_starttag(self, tag, attrs):
        """
        Handles start tags in the HTML content being parsed. Filters out invalid tags and attributes and
        processes valid ones.

        :param str tag: The current start tag encountered by the parser.
        :param List[Tuple[str, str]] attrs: A list of tuples containing the attribute name and value of the current tag.
            The value is None for attributes written without a value.
        """
        filterChars = "\"'\\\0\r\n@()"
        if not (self.validHtml and tag in self.validHtml["validTags"]):
            # Unknown tags are replaced by a single space
            self.result += " "
            return

        cacheTagStart = '<' + tag
        isBlankTarget = False
        styles = None
        classes = None
        for k, v in attrs:
            k = k.strip()
            # HTMLParser reports None for value-less attributes (e.g. <p hidden>)
            v = (v or "").strip()
            if any(c in k for c in filterChars) or any(c in v for c in filterChars):
                # For title, href and alt attributes, @ and () are tolerated
                if not (k in {"title", "href", "alt"} and not any(c in v for c in "\"'\\\0\r\n")):
                    # Either the key or the value contains a character that's not supposed to be there
                    continue
            elif k == "class":
                # Classes are handled below
                classes = v.split(" ")
                continue
            elif k == "style":
                # Styles are handled below
                styles = v.split(";")
                continue
            elif k == "src":
                # We ensure that any src tag starts with an actual url
                if not v.lower().startswith(("http://", "https://", "/")):
                    continue

                file = getattr(conf.main_app.vi, "file", None)
                if file and (filepath := file.parse_download_url(v)):
                    v = file.create_download_url(
                        filepath.dlkey,
                        filepath.filename,
                        filepath.is_derived,
                        expires=None
                    )

                    if self.srcSet:
                        # Build the src set with files already available. If a derived file is not yet build,
                        # getReferencedBlobs will catch it, build it, and we're going to be re-called afterwards.
                        srcSet = file.create_src_set(
                            filepath.dlkey,
                            None,
                            self.srcSet.get("width"),
                            self.srcSet.get("height")
                        )
                        cacheTagStart += f' srcSet="{srcSet}"'
            if tag not in self.validHtml["validAttrs"] or k not in self.validHtml["validAttrs"][tag]:
                # That attribute is not valid on this tag
                continue
            if k.lower()[0:2] != 'on' and v.lower()[0:10] != 'javascript':
                cacheTagStart += f' {k}="{v}"'
            if tag == "a" and k == "target" and v.lower() == "_blank":
                isBlankTarget = True

        if styles:
            cacheTagStart += self._sanitizeStyles(styles, filterChars)
        if classes:
            cacheTagStart += self._sanitizeClasses(classes)
        if isBlankTarget:
            # Add rel tag to prevent the browser to pass window.opener around
            cacheTagStart += " rel=\"noopener noreferrer\""

        if tag in self.validHtml["singleTags"]:
            # Single-Tags do have a visual representation; ensure it makes it into the result
            self.flushCache()
            self.result += cacheTagStart + '>'  # dont need slash in void elements in html5
        else:
            # We opened a 'normal' tag; push it on the cache so it can be discarded later if
            # we detect it has no content
            cacheTagStart += '>'
            self.tagCache.append((cacheTagStart, tag))

    def handle_endtag(self, tag):
        """
        Handles end tags in the HTML content being parsed. Closes open tags and discards invalid ones.

        :param str tag: The current end tag encountered by the parser.
        """
        if not self.validHtml:
            return

        if self.tagCache and tag in [x[1] for x in self.tagCache] + self.openTagsList:
            # The element may still be on the cache (it had no content so far):
            # silently drop the cache from its end up to and including that element.
            # Popping from the end guarantees the most recently cached entry is removed,
            # even if an identical entry was cached earlier.
            while self.tagCache:
                _, cachedTag = self.tagCache.pop()
                if cachedTag == tag:
                    return

        if tag in self.openTagsList:
            # Close all currently open tags until we reach the current one
            for endTag in self.openTagsList[:]:
                self.result += f"</{endTag}>"
                self.openTagsList.remove(endTag)
                if endTag == tag:
                    break

    def cleanup(self):  # FIXME: swapped tags
        """ Append missing closing tags to the result."""
        self.flushCache()
        for tag in self.openTagsList:
            endTag = f'</{tag}>'
            self.result += endTag

    def sanitize(self, instr):
        """
        Sanitizes the input HTML string by removing invalid tags and attributes while retaining valid ones.

        :param str instr: The input HTML string to be sanitized.
        :return: The sanitized HTML string.
        :rtype: str
        """
        self.result = ""
        self.openTagsList = []
        self.tagCache = []
        self.feed(instr)
        self.close()
        self.cleanup()
        return self.result

306 

307 

class TextBone(BaseBone):
    """
    A bone for storing and validating HTML or plain text content. Can be configured to allow
    only specific HTML tags and attributes, and enforce a maximum length. Supports the use of
    srcset for embedded images.

    :param Union[None, Dict] validHtml: A dictionary containing allowed HTML tags and their attributes. Defaults
        to _defaultTags. Must be a structured like :prop:_defaultTags
    :param int max_length: The maximum allowed length for the content. Defaults to 200000.
    :param languages: If set, this bone can store a different content for each language
    :param Dict[str, List] srcSet: An optional dictionary containing width and height for srcset generation.
        Must be a dict of "width": [List of Ints], "height": [List of Ints], eg {"height": [720, 1080]}
    :param bool indexed: Whether the content should be indexed for searching. Defaults to False.
    :param kwargs: Additional keyword arguments to be passed to the base class constructor.
    """

    class __undefinedC__:
        # Sentinel type: lets the constructor distinguish "validHtml not passed" from an explicit None
        pass

    type = "text"

    def __init__(
        self,
        *,
        validHtml: None | dict = __undefinedC__,
        max_length: int = 200000,
        srcSet: t.Optional[dict[str, list]] = None,
        indexed: bool = False,
        **kwargs
    ):
        """
        :param validHtml: If set, must be a structure like :prop:_defaultTags
        :param languages: If set, this bone can store a different content for each language
        :param max_length: Limit content to max_length bytes
        :param indexed: Must not be set True, unless you limit max_length accordingly
        :param srcSet: If set, inject srcset tags to embedded images. Must be a dict of
            "width": [List of Ints], "height": [List of Ints], eg {"height": [720, 1080]}
        """
        # fixme: Remove in viur-core >= 4
        if "maxLength" in kwargs:
            warnings.warn("maxLength parameter is deprecated, please use max_length", DeprecationWarning)
            max_length = kwargs.pop("maxLength")
        super().__init__(indexed=indexed, **kwargs)

        # The sentinel is compared by identity; an explicitly passed None is kept as None
        if validHtml is TextBone.__undefinedC__:
            validHtml = _defaultTags

        self.validHtml = validHtml
        self.max_length = max_length
        self.srcSet = srcSet

    def singleValueSerialize(self, value, skel: 'SkeletonInstance', name: str, parentIndexed: bool):
        """
        Serializes a single value of the TextBone instance for storage.

        This method takes the value as-is without any additional processing, since it's already stored in a format
        suitable for serialization.
        """
        return value

    def singleValueFromClient(self, value, skel, bone_name, client_data):
        """
        Validates a single client value and runs it through the HtmlSerializer.

        :param value: The raw value to read.
        :param skel: The skeleton the bone belongs to (not used here).
        :param bone_name: The name of the bone (not used here).
        :param client_data: The complete client data (not used here).
        :return: A tuple of the sanitized text and None on success, or of the empty value and a list
            holding one ReadFromClientError if :meth:`isInvalid` rejected the value.
        """
        err = self.isInvalid(value)  # Returns None on success, error-str otherwise
        if err:
            return self.getEmptyValue(), [ReadFromClientError(ReadFromClientErrorSeverity.Invalid, err)]
        return HtmlSerializer(self.validHtml, self.srcSet, False).sanitize(value), None

    def getEmptyValue(self):
        """
        Returns an empty value for the TextBone instance.

        This method is used to represent an empty or unset value for the TextBone.

        :return: An empty string.
        :rtype: str
        """
        return ""

    def isInvalid(self, value):
        """
        Checks if the given value is valid for this TextBone instance.

        This method checks whether the given value is valid according to the TextBone's constraints (e.g., not
        None and within the maximum length).

        :param value: The value to be checked for validity.
        :return: Returns None if the value is valid, or an error message string otherwise.
        :rtype: Optional[str]
        """
        if value is None:
            return "No value entered"
        if len(value) > self.max_length:
            return "Maximum length exceeded"
        return None

    def getReferencedBlobs(self, skel: 'viur.core.skeleton.SkeletonInstance', name: str) -> set[str]:
        """
        Extracts and returns the blob keys of referenced files in the HTML content of the TextBone instance.

        This method parses the HTML content of the TextBone with :class:`CollectBlobKeys` and returns the
        blob keys it collected. If a srcSet is configured, it additionally calls ``ensureDerived`` for
        every referenced file that can be found, requesting the configured widths and heights.

        :param SkeletonInstance skel: A SkeletonInstance object containing the data of an entry.
        :param str name: The name of the TextBone for which to find referenced blobs.
        :return: A set containing the blob keys of the referenced files in the TextBone's HTML content.
        :rtype: Set[str]
        """
        collector = CollectBlobKeys()

        for idx, lang, value in self.iter_bone_value(skel, name):
            if value:
                collector.feed(value)

        blob_keys = collector.blobs

        if blob_keys and self.srcSet:
            derive_dict = {
                "thumbnail": [
                    {"width": x} for x in (self.srcSet.get("width") or [])
                ] + [
                    {"height": x} for x in (self.srcSet.get("height") or [])
                ]
            }
            from viur.core.bones.file import ensureDerived
            for blob_key in blob_keys:
                # Of all file entries sharing this dlkey, take the first one by ascending creationdate
                file_obj = db.Query("file").filter("dlkey =", blob_key) \
                    .order(("creationdate", db.SortOrder.Ascending)).getEntry()
                if file_obj:
                    ensureDerived(file_obj.key, f"{skel.kindName}_{name}", derive_dict, skel["key"])

        return blob_keys

    def refresh(self, skel, boneName) -> None:
        """
        Re-parses the text content of the TextBone instance to rebuild the src-set if necessary.

        This method is useful when the src-set configuration has changed and needs to be applied
        to the existing HTML content. It re-parses the content and updates the src-set attributes
        accordingly. Nothing happens if no srcSet is configured.

        :param SkeletonInstance skel: A SkeletonInstance object containing the data of an entry.
        :param str boneName: The name of the TextBone for which to refresh the src-set.
        """
        if self.srcSet:
            val = skel[boneName]
            if self.languages and isinstance(val, dict):
                skel[boneName] = {k: self.singleValueFromClient(v, skel, boneName, None)[0] for k, v in val.items()}
            elif not self.languages and isinstance(val, str):
                skel[boneName] = self.singleValueFromClient(val, skel, boneName, None)[0]

    def getSearchTags(self, skel: 'viur.core.skeleton.SkeletonInstance', name: str) -> set[str]:
        """
        Extracts search tags from the text content of a TextBone.

        This method iterates over the values of the TextBone in the given skeleton, and for each value that
        is not None, it tokenizes the text by lines and single spaces. Then, it adds the lowercase version
        of each word to a set of search tags, which is returned at the end.

        :param skel: A SkeletonInstance containing the TextBone.
        :param name: The name of the TextBone in the skeleton.
        :return: A set of unique search tags (lowercase words) extracted from the text content of the TextBone.
        """
        result = set()
        for idx, lang, value in self.iter_bone_value(skel, name):
            if value is None:
                continue
            for line in str(value).splitlines():
                for word in line.split(" "):
                    result.add(word.lower())
        return result

    def getUniquePropertyIndexValues(self, valuesCache: dict, name: str) -> list[str]:
        """
        Retrieves the unique property index values for the TextBone.

        If the TextBone supports multiple languages, this method raises a NotImplementedError, as it's unclear
        whether each language should be kept distinct or not. Otherwise, it calls the superclass's
        getUniquePropertyIndexValues method to retrieve the unique property index values.

        :param valuesCache: A dictionary containing the cached values for the TextBone.
        :param name: The name of the TextBone.
        :return: A list of unique property index values for the TextBone.
        :raises NotImplementedError: If the TextBone supports multiple languages.
        """
        if self.languages:
            # Not yet implemented as it's unclear if we should keep each language distinct or not
            raise NotImplementedError()

        return super().getUniquePropertyIndexValues(valuesCache, name)

    def structure(self) -> dict:
        """
        Returns the structure of the base class, extended by the bone's HTML configuration
        under the key "valid_html".
        """
        return super().structure() | {
            "valid_html": self.validHtml,
        }