Coverage for /home/runner/work/viur-core/viur-core/viur/src/viur/core/bones/text.py: 47%

221 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2025-02-07 19:28 +0000

1""" 

2The `text` module contains the `TextBone` and a custom HTML-Parser 

3to validate and extract client data for the `TextBone`. 

4""" 

5import html 

6import string 

7import typing as t 

8import warnings 

9from html.parser import HTMLParser 

10from viur.core import db, conf 

11from viur.core.bones.base import BaseBone, ReadFromClientError, ReadFromClientErrorSeverity 

12 

13 

class HtmlBoneConfiguration(t.TypedDict):
    """Typed description of the HTML whitelist that is applied to the content of a TextBone."""

    validTags: list[str]
    """Names of the HTML tags that may appear in the content."""

    validAttrs: dict[str, list[str]]
    """Attributes permitted per tag name. A tag that has no entry here accepts no attributes at all."""

    validStyles: list[str]
    """CSS property names that may be used inside a ``style`` attribute."""

    validClasses: list[str]
    """CSS class names that may be used inside a ``class`` attribute."""

    singleTags: list[str]
    """Tags that are written without a matching end tag (self-closing elements)."""

31 

32 

class CollectBlobKeys(HTMLParser):
    """
    HTML parser that gathers the blob keys of files referenced by <a> and <img> tags.

    Only the "src" attribute of those tags is inspected. Each value that the file module
    recognises as a download URL contributes its ``dlkey`` to :attr:`blobs`.
    """

    def __init__(self):
        super().__init__()
        self.blobs = set()  # dlkeys of all referenced files found so far

    def handle_starttag(self, tag, attrs):
        """
        Inspect a start tag and record the blob key of a referenced file, if there is one.

        :param str tag: The current start tag encountered by the parser.
        :param List[Tuple[str, str]] attrs: A list of tuples containing the attribute name and value of the current tag.
        """
        if tag not in ("a", "img"):
            return

        # NOTE(review): anchors usually carry their target in "href", which is not inspected
        # here - confirm whether <a href="..."> downloads are meant to be collected as well.
        for k, v in attrs:
            # Valueless attributes (``<img src>``) are reported with a value of None and an
            # empty value cannot be a download URL either, so there is nothing to parse.
            if k != "src" or not v:
                continue

            file = getattr(conf.main_app.vi, "file", None)
            if file and (filepath := file.parse_download_url(v)):
                self.blobs.add(filepath.dlkey)

57 

58 

class HtmlSerializer(HTMLParser):
    """
    A custom HTML parser that sanitizes HTML content: tags, attributes, styles and classes that are
    not white-listed are dropped, everything else is serialized back into a string.

    Start tags are not written immediately. They are parked in ``tagCache`` and only flushed once
    content follows, so elements without any content never reach the result.

    :param dict validHtml: A dictionary containing valid HTML tags, attributes, styles, and classes.
        If it is ``None`` (or empty), every tag is discarded.
    :param dict srcSet: A dictionary containing width and height for srcset attribute processing.
    :param bool convert_charrefs: Passed through to :class:`html.parser.HTMLParser`.
    """
    __html_serializer_trans = str.maketrans({
        "<": "&lt;",
        ">": "&gt;",
        "\"": "&quot;",
        "'": "&#39;",
        "\n": "",
        "\0": "",
    })

    # Characters that disqualify an attribute name/value or a style declaration
    _FILTER_CHARS = frozenset("\"'\\\0\r\n@()")
    # Subset of the above that stays forbidden even inside free-text attributes
    _STRICT_FILTER_CHARS = frozenset("\"'\\\0\r\n")
    # Attributes whose value may contain "@", "(" and ")"
    _FREE_TEXT_ATTRS = frozenset({"title", "href", "alt"})
    # Characters a CSS class name may consist of
    _CLASS_CHARS = frozenset(string.ascii_letters + string.digits + "-")

    def __init__(
        self,
        validHtml: t.Optional["HtmlBoneConfiguration"] = None,
        srcSet=None,
        convert_charrefs: bool = True,
    ):
        super().__init__(convert_charrefs=convert_charrefs)
        self.result = ""  # The final result that will be returned
        self.openTagsList = []  # List of tags that still need to be closed
        self.tagCache = []  # (start-tag markup, tag name) tuples that are processed but not written yet
        self.validHtml = validHtml
        self.srcSet = srcSet

    def handle_data(self, data):
        """
        Escape the special characters of a text node and append it to the result.

        Text that consists of whitespace only is ignored.

        :param str data: The data encountered by the parser.
        """
        data = str(data).translate(HtmlSerializer.__html_serializer_trans)
        if data.strip():
            self.flushCache()
            self.result += data

    def handle_charref(self, name):
        """
        Append a numeric character reference to the result.

        :param str name: The name of the character reference.
        """
        self.flushCache()
        self.result += f"&#{name};"

    def handle_entityref(self, name):  # FIXME
        """
        Append a named entity reference to the result; unknown entity names are dropped.

        :param str name: The name of the entity reference.
        """
        if name in html.entities.entitydefs:
            self.flushCache()
            self.result += f"&{name};"

    def flushCache(self):
        """
        Flush pending tags into the result and push their corresponding end-tags onto the stack
        """
        for start, end in self.tagCache:
            self.result += start
            self.openTagsList.insert(0, end)
        self.tagCache = []

    def _style_attribute(self, styles):
        """Return the sanitized `` style="..."`` markup for the given declarations ("" if none survive)."""
        accepted = {}
        for declaration in styles:
            style, separator, value = declaration.partition(":")
            if not separator:
                # Not a "property: value" pair (e.g. the empty rest behind a trailing ";")
                continue
            style = style.strip()
            value = value.strip()
            if not self._FILTER_CHARS.isdisjoint(style) or not self._FILTER_CHARS.isdisjoint(value):
                # Either the key or the value contains a character that's not supposed to be there
                continue
            if value.lower().startswith(("expression", "import")):
                # IE evaluates JS inside styles if the keyword expression is present
                continue
            if style in self.validHtml["validStyles"] and not any(x in value for x in ("\"", ":", ";")):
                accepted[style] = value
        if not accepted:
            return ""
        joined = "; ".join(f"{style}: {value}" for style, value in accepted.items())
        return f' style="{joined}"'

    def _class_attribute(self, classes):
        """Return the sanitized `` class="..."`` markup for the given class names ("" if none survive)."""
        accepted = []
        for current in classes:
            if not current or not self._CLASS_CHARS.issuperset(current):
                # Empty token (repeated blanks) or the class contains invalid characters
                continue
            for valid in self.validHtml["validClasses"]:
                # The classname has to match exactly or be white-listed by a "prefix*" entry
                if valid == current or (valid.endswith("*") and current.startswith(valid[:-1])):
                    accepted.append(current)
                    break
        if not accepted:
            return ""
        return f' class="{" ".join(accepted)}"'

    def handle_starttag(self, tag, attrs):
        """
        Handles start tags in the HTML content being parsed. Filters out invalid tags and attributes and
        processes valid ones.

        :param str tag: The current start tag encountered by the parser.
        :param List[Tuple[str, str]] attrs: A list of tuples containing the attribute name and value of the current tag.
        """
        if not self.validHtml or tag not in self.validHtml["validTags"]:
            # Tags that are not allowed are replaced by a single blank
            self.result += " "
            return

        valid_attrs = self.validHtml["validAttrs"]
        cacheTagStart = "<" + tag
        isBlankTarget = False
        styles = None
        classes = None

        for k, v in attrs:
            k = k.strip()
            # Valueless attributes (``<p hidden>``) are reported with a value of None
            v = (v or "").strip()

            if not self._FILTER_CHARS.isdisjoint(k) or not self._FILTER_CHARS.isdisjoint(v):
                # Either the key or the value contains a character that's not supposed to be there.
                # For free-text attributes (title, href, alt) "@" and "()" are tolerated.
                if k not in self._FREE_TEXT_ATTRS or not self._STRICT_FILTER_CHARS.isdisjoint(v):
                    continue
            elif k == "class":
                # Classes are handled below
                classes = v.split(" ")
                continue
            elif k == "style":
                # Styles are handled below
                styles = v.split(";")
                continue
            elif k == "src":
                # We ensure that any src tag starts with an actual url
                if not v.lower().startswith(("http://", "https://", "/")):
                    continue

                file = getattr(conf.main_app.vi, "file", None)
                if file and (filepath := file.parse_download_url(v)):
                    v = file.create_download_url(
                        filepath.dlkey,
                        filepath.filename,
                        filepath.is_derived,
                        expires=None
                    )

                    if self.srcSet:
                        # Build the src set with files already available. If a derived file is not yet build,
                        # getReferencedBlobs will catch it, build it, and we're going to be re-called afterwards.
                        srcSet = file.create_src_set(
                            filepath.dlkey,
                            None,
                            self.srcSet.get("width"),
                            self.srcSet.get("height")
                        )
                        cacheTagStart += f' srcSet="{srcSet}"'

            if tag not in valid_attrs or k not in valid_attrs[tag]:
                # That attribute is not valid on this tag
                continue
            # Never emit event handlers (on*) or javascript: URLs
            if k.lower()[0:2] != "on" and v.lower()[0:10] != "javascript":
                cacheTagStart += f' {k}="{v}"'
            if tag == "a" and k == "target" and v.lower() == "_blank":
                isBlankTarget = True

        if styles:
            cacheTagStart += self._style_attribute(styles)
        if classes:
            cacheTagStart += self._class_attribute(classes)
        if isBlankTarget:
            # Add rel tag to prevent the browser to pass window.opener around
            cacheTagStart += " rel=\"noopener noreferrer\""

        if tag in self.validHtml["singleTags"]:
            # Single-Tags do have a visual representation; ensure it makes it into the result
            self.flushCache()
            self.result += cacheTagStart + ">"  # dont need slash in void elements in html5
        else:
            # We opened a 'normal' tag; push it on the cache so it can be discarded later if
            # we detect it has no content
            self.tagCache.append((cacheTagStart + ">", tag))

    def handle_endtag(self, tag):
        """
        Handles end tags in the HTML content being parsed. Closes open tags and discards invalid ones.

        :param str tag: The current end tag encountered by the parser.
        """
        if not self.validHtml:
            return

        if self.tagCache and tag in [x[1] for x in self.tagCache] + self.openTagsList:
            # The element (or something nested in it) is still on the cache and therefore has no
            # content: silently drop the cache, newest entry first, up to that element.
            while self.tagCache:
                _, cached_tag = self.tagCache.pop()
                if cached_tag == tag:
                    return

        if tag in self.openTagsList:
            # Close all currently open tags until we reach the current one
            while self.openTagsList:
                endTag = self.openTagsList.pop(0)
                self.result += f"</{endTag}>"
                if endTag == tag:
                    break

    def cleanup(self):  # FIXME: swapped tags
        """ Append missing closing tags to the result."""
        self.flushCache()
        for tag in self.openTagsList:
            self.result += f"</{tag}>"
        # Everything is closed now; calling cleanup again must not close the tags twice
        self.openTagsList = []

    def sanitize(self, instr):
        """
        Sanitizes the input HTML string by removing invalid tags and attributes while retaining valid ones.

        :param str instr: The input HTML string to be sanitized.
        :return: The sanitized HTML string.
        :rtype: str
        """
        # Start from a clean state so an instance can be used for more than one document
        self.reset()
        self.result = ""
        self.openTagsList = []
        self.tagCache = []
        self.feed(instr)
        self.close()
        self.cleanup()
        return self.result

287 

288 

class TextBone(BaseBone):
    """
    A bone for storing and validating HTML or plain text content. Can be configured to allow
    only specific HTML tags and attributes, and enforce a maximum length. Supports the use of
    srcset for embedded images.

    :param validHtml: A dictionary containing allowed HTML tags and their attributes.
        Defaults to `conf.bone_html_default_allow`.
    :param max_length: The maximum allowed length for the content. Defaults to 200000.
    :param languages: If set, this bone can store a different content for each language
    :param srcSet: An optional dictionary containing width and height for srcset generation.
        Must be a dict of "width": [List of Ints], "height": [List of Ints], eg {"height": [720, 1080]}
    :param indexed: Whether the content should be indexed for searching. Defaults to False.
    :param kwargs: Additional keyword arguments to be passed to the base class constructor.
    """

    class __undefinedC__:
        """Sentinel that tells "validHtml was not passed" apart from an explicit ``None``."""

    type = "text"

    def __init__(
        self,
        *,
        validHtml: None | HtmlBoneConfiguration = __undefinedC__,
        max_length: int = 200000,
        srcSet: t.Optional[dict[str, list]] = None,
        indexed: bool = False,
        **kwargs
    ):
        """
        :param validHtml: If set, must be a structure like `conf.bone_html_default_allow`
        :param languages: If set, this bone can store a different content for each language
        :param max_length: Limit content to max_length (compared against ``len()`` of the value)
        :param indexed: Must not be set True, unless you limit max_length accordingly
        :param srcSet: If set, inject srcset tags to embedded images. Must be a dict of
            "width": [List of Ints], "height": [List of Ints], eg {"height": [720, 1080]}
        """
        # fixme: Remove in viur-core >= 4
        if "maxLength" in kwargs:
            # stacklevel=2 attributes the warning to the code that instantiates the bone
            warnings.warn(
                "maxLength parameter is deprecated, please use max_length",
                DeprecationWarning,
                stacklevel=2,
            )
            max_length = kwargs.pop("maxLength")
        super().__init__(indexed=indexed, **kwargs)

        # The sentinel is compared by identity; an explicitly passed None is kept as it is
        if validHtml is TextBone.__undefinedC__:
            validHtml = conf.bone_html_default_allow

        self.validHtml = validHtml
        self.max_length = max_length
        self.srcSet = srcSet

    def singleValueSerialize(self, value, skel: 'SkeletonInstance', name: str, parentIndexed: bool):
        """
        Serializes a single value of the TextBone instance for storage.

        The value is returned as-is, without any additional processing.
        """
        return value

    def singleValueFromClient(self, value, skel, bone_name, client_data):
        """
        Validate and sanitize a single value received from the client.

        :return: A tuple of (sanitized value, None) on success, or of (empty value, list with one
            ReadFromClientError) if :meth:`isInvalid` rejects the value.
        """
        if err := self.isInvalid(value):  # None on success, error-str otherwise
            return self.getEmptyValue(), [ReadFromClientError(ReadFromClientErrorSeverity.Invalid, err)]
        return HtmlSerializer(self.validHtml, self.srcSet, False).sanitize(value), None

    def getEmptyValue(self):
        """
        Returns the value that represents an empty or unset TextBone.

        :return: An empty string.
        :rtype: str
        """
        return ""

    def isInvalid(self, value):
        """
        Checks if the given value is valid for this TextBone instance.

        A value is rejected if it is None or if its length exceeds ``max_length``.

        :param value: The value to be checked for validity.
        :return: Returns None if the value is valid, or an error message string otherwise.
        :rtype: Optional[str]
        """
        if value is None:
            return "No value entered"
        if len(value) > self.max_length:
            return "Maximum length exceeded"
        return None

    def getReferencedBlobs(self, skel: 'viur.core.skeleton.SkeletonInstance', name: str) -> set[str]:
        """
        Extracts and returns the blob keys of referenced files in the HTML content of the TextBone instance.

        Every non-empty value of the bone is fed into a :class:`CollectBlobKeys` parser. If a srcSet is
        configured, ``ensureDerived`` is additionally called for each file entry that is found for a
        collected blob key, with the configured widths and heights as "thumbnail" derives.

        :param SkeletonInstance skel: A SkeletonInstance object containing the data of an entry.
        :param str name: The name of the TextBone for which to find referenced blobs.
        :return: A set containing the blob keys of the referenced files in the TextBone's HTML content.
        :rtype: Set[str]
        """
        collector = CollectBlobKeys()

        for idx, lang, value in self.iter_bone_value(skel, name):
            if value:
                collector.feed(value)

        blob_keys = collector.blobs

        if blob_keys and self.srcSet:
            derive_dict = {
                "thumbnail": [
                    {"width": x} for x in (self.srcSet.get("width") or [])
                ] + [
                    {"height": x} for x in (self.srcSet.get("height") or [])
                ]
            }
            from viur.core.bones.file import ensureDerived
            for blob_key in blob_keys:
                # Several file entries may share a dlkey; the oldest one is used
                file_obj = db.Query("file").filter("dlkey =", blob_key) \
                    .order(("creationdate", db.SortOrder.Ascending)).getEntry()
                if file_obj:
                    ensureDerived(file_obj.key, f"{skel.kindName}_{name}", derive_dict, skel["key"])

        return blob_keys

    def refresh(self, skel, boneName) -> None:
        """
        Re-parses the text content of the TextBone instance to rebuild the src-set if necessary.

        Nothing happens unless a srcSet is configured. Otherwise the stored content is run through
        :meth:`singleValueFromClient` again - per language if the bone is multi-lingual and holds a
        dict, or directly if it is not multi-lingual and holds a string.

        :param SkeletonInstance skel: A SkeletonInstance object containing the data of an entry.
        :param str boneName: The name of the TextBone for which to refresh the src-set.
        """
        if self.srcSet:
            val = skel[boneName]
            if self.languages and isinstance(val, dict):
                skel[boneName] = {k: self.singleValueFromClient(v, skel, boneName, None)[0] for k, v in val.items()}
            elif not self.languages and isinstance(val, str):
                skel[boneName] = self.singleValueFromClient(val, skel, boneName, None)[0]

    def getSearchTags(self, skel: 'viur.core.skeleton.SkeletonInstance', name: str) -> set[str]:
        """
        Extracts search tags from the text content of a TextBone.

        Each value that is not None is split into lines and each line into blank-separated words;
        the lowercase form of every non-empty word becomes a search tag.

        :param skel: A SkeletonInstance containing the TextBone.
        :param name: The name of the TextBone in the skeleton.
        :return: A set of unique search tags (lowercase words) extracted from the text content of the TextBone.
        """
        result = set()
        for idx, lang, value in self.iter_bone_value(skel, name):
            if value is None:
                continue
            for line in str(value).splitlines():
                for word in line.split(" "):
                    # Consecutive blanks produce empty strings, which are no useful search tag
                    if word:
                        result.add(word.lower())
        return result

    def getUniquePropertyIndexValues(self, valuesCache: dict, name: str) -> list[str]:
        """
        Retrieves the unique property index values for the TextBone.

        :param valuesCache: A dictionary containing the cached values for the TextBone.
        :param name: The name of the TextBone.
        :return: A list of unique property index values for the TextBone.
        :raises NotImplementedError: If the TextBone supports multiple languages.
        """
        if self.languages:
            # Not yet implemented as it's unclear if we should keep each language distinct or not
            raise NotImplementedError()

        return super().getUniquePropertyIndexValues(valuesCache, name)

    def structure(self) -> dict:
        """Return the structure of the base class, extended by the ``valid_html`` configuration."""
        return super().structure() | {
            "valid_html": self.validHtml,
        }