Coverage for /home/runner/work/viur-core/viur-core/viur/src/viur/core/bones/text.py: 47%

"""
The `text` module contains the `TextBone` and a custom HTML parser
to validate and extract client data for the `TextBone`.
"""

5import html

6import string

7import typing as t

8import warnings

9from html.parser import HTMLParser

10from viur.core import db, conf

11from viur.core.bones.base import BaseBone, ReadFromClientError, ReadFromClientErrorSeverity

class HtmlBoneConfiguration(t.TypedDict):
    """Typed description of the HTML whitelist a TextBone uses to sanitize client input."""

    validTags: list[str]
    """Names of the HTML tags that are kept; every other tag is stripped."""

    validAttrs: dict[str, list[str]]
    """Per-tag whitelist of attribute names. A tag that is missing here accepts no attributes at all."""

    validStyles: list[str]
    """CSS property names that may appear inside a `style` attribute."""

    validClasses: list[str]
    """CSS class names that may appear inside a `class` attribute."""

    singleTags: list[str]
    """Void (self-closing) tags, i.e. tags that are never followed by an end tag."""

class CollectBlobKeys(HTMLParser):
    """
    A custom HTML parser that collects the blob keys of all files referenced by the parsed markup.

    Images are referenced through the "src" attribute of <img> tags, file links through the "href"
    attribute of <a> tags. Every URL that the file module recognizes as a download URL contributes
    its blob key (`dlkey`) to :attr:`blobs`.
    """

    # Attributes that may carry a file URL, per tag. "src" on <a> is not valid HTML but is still
    # accepted, because earlier versions of this parser honoured it.
    _URL_ATTRS = {
        "a": ("href", "src"),
        "img": ("src",),
    }

    def __init__(self):
        super().__init__()
        self.blobs = set()

    def handle_starttag(self, tag, attrs):
        """
        Handles a start tag in the HTML content being parsed. For <a> and <img> elements, the blob key
        is extracted from the URL attribute and added to the "blobs" set.

        :param str tag: The current start tag encountered by the parser.
        :param List[Tuple[str, str]] attrs: A list of tuples containing the attribute name and value of the current tag.
        """
        url_attrs = self._URL_ATTRS.get(tag)
        if not url_attrs:
            return

        for key, value in attrs:
            # HTMLParser reports value-less attributes (`<img src>`) with a value of None
            if key not in url_attrs or not value:
                continue

            file = getattr(conf.main_app.vi, "file", None)
            if file and (filepath := file.parse_download_url(value)):
                self.blobs.add(filepath.dlkey)

class HtmlSerializer(HTMLParser):
    """
    A custom HTML parser that extends the HTMLParser class to sanitize and serialize HTML content
    by removing invalid tags and attributes while retaining the valid ones.

    :param dict validHtml: A dictionary containing valid HTML tags, attributes, styles, and classes.
        If it is None (or empty), every tag is stripped.
    :param dict srcSet: A dictionary containing width and height for srcset attribute processing.
    :param bool convert_charrefs: Passed through to :class:`HTMLParser`.
    """

    # Escapes markup-relevant characters in text nodes; line breaks and NUL bytes are removed.
    __html_serializer_trans = str.maketrans({
        "<": "&lt;",
        ">": "&gt;",
        "\"": "&quot;",
        "'": "&#39;",
        "\n": "",
        "\0": "",
    })

    def __init__(self, validHtml: "HtmlBoneConfiguration | None" = None, srcSet=None,
                 convert_charrefs: bool = True):
        super().__init__(convert_charrefs=convert_charrefs)
        self.result = ""  # The final result that will be returned
        self.openTagsList = []  # End-tag names still to be written, innermost tag first
        self.tagCache = []  # (start-tag markup, tag name) pairs that were parsed but not written yet
        self.validHtml = validHtml
        self.srcSet = srcSet

    def handle_data(self, data):
        """
        Handles the data encountered in the HTML content being parsed. Escapes special characters
        and appends the data to the result if it is not only whitespace characters.

        :param str data: The data encountered by the parser.
        """
        data = str(data).translate(HtmlSerializer.__html_serializer_trans)
        if data.strip():
            self.flushCache()
            self.result += data

    def handle_charref(self, name):
        """
        Handles character references in the HTML content being parsed and appends the character
        reference to the result.

        :param str name: The name of the character reference.
        """
        self.flushCache()
        self.result += f"&#{name};"

    def handle_entityref(self, name):  # FIXME
        """
        Handles entity references in the HTML content being parsed and appends the entity reference
        to the result. Entities that are not listed in `html.entities.entitydefs` are dropped.

        :param str name: The name of the entity reference.
        """
        if name in html.entities.entitydefs:
            self.flushCache()
            self.result += f"&{name};"

    def flushCache(self):
        """
        Flush pending tags into the result and push their corresponding end-tags onto the stack
        """
        for start, end in self.tagCache:
            self.result += start
            self.openTagsList.insert(0, end)
        self.tagCache = []

    def _serialize_styles(self, styles, filterChars):
        """Return the ` style="..."` fragment for the whitelisted declarations in *styles* ("" if none)."""
        syleRes = {}
        for s in styles:
            style, sep, value = s.partition(":")
            if not sep:
                # Not a `property: value` declaration (e.g. the empty part after a trailing ";")
                continue
            style = style.strip()
            value = value.strip()
            if any(c in style for c in filterChars) or any(c in value for c in filterChars):
                # Either the key or the value contains a character that's not supposed to be there
                continue
            if value.lower().startswith(("expression", "import")):
                # IE evaluates JS inside styles if the keyword expression is present
                continue
            if style in self.validHtml["validStyles"] and not any(x in value for x in ("\"", ":", ";")):
                syleRes[style] = value
        if not syleRes:
            return ""
        return ' style="' + "; ".join(f"{k}: {v}" for k, v in syleRes.items()) + '"'

    def _serialize_classes(self, classes):
        """Return the ` class="..."` fragment for the whitelisted names in *classes* ("" if none)."""
        validClassChars = string.ascii_letters + string.digits + "-"
        validClasses = []
        for currentClass in classes:
            if not all(x in validClassChars for x in currentClass):
                # The class contains invalid characters
                continue
            for validClass in self.validHtml["validClasses"]:
                # Check if the classname matches or is white-listed by a prefix
                if validClass == currentClass:
                    break
                if validClass.endswith("*") and currentClass.startswith(validClass[:-1]):
                    break
            else:
                continue  # not white-listed
            validClasses.append(currentClass)
        if not validClasses:
            return ""
        return ' class="' + " ".join(validClasses) + '"'

    def handle_starttag(self, tag, attrs):
        """
        Handles start tags in the HTML content being parsed. Filters out invalid tags and attributes and
        processes valid ones.

        :param str tag: The current start tag encountered by the parser.
        :param List[Tuple[str, str]] attrs: A list of tuples containing the attribute name and value of the current tag.
        """
        if not (self.validHtml and tag in self.validHtml["validTags"]):
            # Tag is not allowed; write a blank in its place
            self.result += " "
            return

        filterChars = "\"'\\\0\r\n@()"
        cacheTagStart = "<" + tag
        isBlankTarget = False
        styles = None
        classes = None

        for k, v in attrs:
            k = k.strip()
            # HTMLParser reports value-less attributes (`<input disabled>`) with a value of None
            v = (v or "").strip()
            if any(c in k for c in filterChars) or any(c in v for c in filterChars):
                if k in {"title", "href", "alt"} and not any(c in v for c in "\"'\\\0\r\n"):
                    # If we have a title, href or alt attribute, ignore @ and ()
                    pass
                else:
                    # Either the key or the value contains a character that's not supposed to be there
                    continue
            elif k == "class":
                # Classes are handled below
                classes = v.split(" ")
                continue
            elif k == "style":
                # Styles are handled below
                styles = v.split(";")
                continue
            elif k == "src":
                # We ensure that any src tag starts with an actual url
                checker = v.lower()
                if not checker.startswith(("http://", "https://", "/")):
                    continue

                file = getattr(conf.main_app.vi, "file", None)
                if file and (filepath := file.parse_download_url(v)):
                    v = file.create_download_url(
                        filepath.dlkey,
                        filepath.filename,
                        filepath.is_derived,
                        expires=None
                    )

                    if self.srcSet:
                        # Build the src set with files already available. If a derived file is not yet build,
                        # getReferencedBlobs will catch it, build it, and we're going to be re-called afterwards.
                        srcSet = file.create_src_set(
                            filepath.dlkey,
                            None,
                            self.srcSet.get("width"),
                            self.srcSet.get("height")
                        )
                        cacheTagStart += f' srcSet="{srcSet}"'

            validAttrs = self.validHtml["validAttrs"]
            if tag not in validAttrs or k not in validAttrs[tag]:
                # That attribute is not valid on this tag
                continue
            if k.lower()[0:2] != "on" and v.lower()[0:10] != "javascript":
                cacheTagStart += f' {k}="{v}"'
            if tag == "a" and k == "target" and v.lower() == "_blank":
                isBlankTarget = True

        if styles:
            cacheTagStart += self._serialize_styles(styles, filterChars)
        if classes:
            cacheTagStart += self._serialize_classes(classes)
        if isBlankTarget:
            # Add rel tag to prevent the browser to pass window.opener around
            cacheTagStart += " rel=\"noopener noreferrer\""

        if tag in self.validHtml["singleTags"]:
            # Single-Tags do have a visual representation; ensure it makes it into the result
            self.flushCache()
            self.result += cacheTagStart + ">"  # dont need slash in void elements in html5
        else:
            # We opened a 'normal' tag; push it on the cache so it can be discarded later if
            # we detect it has no content
            cacheTagStart += ">"
            self.tagCache.append((cacheTagStart, tag))

    def handle_endtag(self, tag):
        """
        Handles end tags in the HTML content being parsed. Closes open tags and discards invalid ones.

        :param str tag: The current end tag encountered by the parser.
        """
        if not self.validHtml:
            return

        if self.tagCache and tag in [x[1] for x in self.tagCache] + self.openTagsList:
            # The element (or something nested in it) is still on the cache, so it has no content yet.
            # Silently drop the cache from the innermost entry up to that element. Popping from the
            # end keeps the remaining entries in their nesting order, even if equal entries exist.
            while self.tagCache:
                if self.tagCache.pop()[1] == tag:
                    return

        if tag in self.openTagsList:
            # Close all currently open tags until we reach the current one
            for endTag in self.openTagsList[:]:
                self.result += f"</{endTag}>"
                self.openTagsList.remove(endTag)
                if endTag == tag:
                    break

    def cleanup(self):  # FIXME: swapped tags
        """ Append missing closing tags to the result."""
        self.flushCache()
        for tag in self.openTagsList:
            self.result += f"</{tag}>"

    def sanitize(self, instr):
        """
        Sanitizes the input HTML string by removing invalid tags and attributes while retaining valid ones.

        The parser state is reset first, so one instance can sanitize several strings in a row.

        :param str instr: The input HTML string to be sanitized.
        :return: The sanitized HTML string.
        :rtype: str
        """
        self.reset()  # drop parser state (e.g. CDATA mode) left over from a previous run
        self.result = ""
        self.openTagsList = []
        self.tagCache = []
        self.feed(instr)
        self.close()
        self.cleanup()
        return self.result

287

288

class TextBone(BaseBone):
    """
    A bone for storing and validating HTML or plain text content. Can be configured to allow
    only specific HTML tags and attributes, and enforce a maximum length. Supports the use of
    srcset for embedded images.

    :param validHtml: A dictionary containing allowed HTML tags and their attributes.
        Defaults to `conf.bone_html_default_allow`.
    :param max_length: The maximum allowed length for the content. Defaults to 200000.
    :param languages: If set, this bone can store a different content for each language
    :param srcSet: An optional dictionary containing width and height for srcset generation.
        Must be a dict of "width": [List of Ints], "height": [List of Ints], eg {"height": [720, 1080]}
    :param indexed: Whether the content should be indexed for searching. Defaults to False.
    :param kwargs: Additional keyword arguments to be passed to the base class constructor.
    """

    class __undefinedC__:
        """Sentinel default for `validHtml`, so that an explicit `None` can be told apart from "not given"."""
        pass

    type = "text"

    def __init__(
        self,
        *,
        validHtml: None | HtmlBoneConfiguration = __undefinedC__,
        max_length: int = 200000,
        srcSet: t.Optional[dict[str, list]] = None,
        indexed: bool = False,
        **kwargs
    ):
        """
        :param validHtml: If set, must be a structure like `conf.bone_html_default_allow`
        :param languages: If set, this bone can store a different content for each language
        :param max_length: Limit content to max_length characters
        :param indexed: Must not be set True, unless you limit max_length accordingly
        :param srcSet: If set, inject srcset tags to embedded images. Must be a dict of
            "width": [List of Ints], "height": [List of Ints], eg {"height": [720, 1080]}
        """
        # fixme: Remove in viur-core >= 4
        if "maxLength" in kwargs:
            # stacklevel=2 attributes the warning to the code that constructs the bone
            warnings.warn(
                "maxLength parameter is deprecated, please use max_length",
                DeprecationWarning,
                stacklevel=2,
            )
            max_length = kwargs.pop("maxLength")
        super().__init__(indexed=indexed, **kwargs)

        # Identity check: the sentinel is a class, an equality test against arbitrary values is not needed
        if validHtml is TextBone.__undefinedC__:
            validHtml = conf.bone_html_default_allow

        self.validHtml = validHtml
        self.max_length = max_length
        self.srcSet = srcSet

    def singleValueSerialize(self, value, skel: 'SkeletonInstance', name: str, parentIndexed: bool):
        """
        Serializes a single value of the TextBone instance for storage.

        This method returns the value unchanged.
        """
        return value

    def singleValueFromClient(self, value, skel, bone_name, client_data):
        """
        Validates a single client value and sanitizes its HTML.

        :return: A tuple of the sanitized text and None on success, otherwise a tuple of the empty
            value and a list containing one ReadFromClientError.
        """
        if err := self.isInvalid(value):  # isInvalid returns None on success, an error string otherwise
            return self.getEmptyValue(), [ReadFromClientError(ReadFromClientErrorSeverity.Invalid, err)]

        return HtmlSerializer(self.validHtml, self.srcSet, False).sanitize(value), None

    def getEmptyValue(self):
        """
        Returns an empty value for the TextBone instance.

        This method is used to represent an empty or unset value for the TextBone.

        :return: An empty string.
        :rtype: str
        """
        return ""

    def isInvalid(self, value):
        """
        Checks if the given value is valid for this TextBone instance.

        A value is valid if it is not None and not longer than `max_length`.

        :param value: The value to be checked for validity.
        :return: Returns None if the value is valid, or an error message string otherwise.
        :rtype: Optional[str]
        """
        if value is None:
            return "No value entered"
        if len(value) > self.max_length:
            return "Maximum length exceeded"
        return None

    def getReferencedBlobs(self, skel: 'viur.core.skeleton.SkeletonInstance', name: str) -> set[str]:
        """
        Extracts and returns the blob keys of referenced files in the HTML content of the TextBone instance.

        The HTML content of every value of this bone is fed to a `CollectBlobKeys` parser. If blob keys
        were found and a srcSet is configured, `ensureDerived` is called for each file entry found for
        those keys, with a "thumbnail" list built from the configured widths and heights.

        :param SkeletonInstance skel: A SkeletonInstance object containing the data of an entry.
        :param str name: The name of the TextBone for which to find referenced blobs.
        :return: A set containing the blob keys of the referenced files in the TextBone's HTML content.
        :rtype: Set[str]
        """
        collector = CollectBlobKeys()

        for idx, lang, value in self.iter_bone_value(skel, name):
            if value:
                collector.feed(value)

        blob_keys = collector.blobs

        if blob_keys and self.srcSet:
            derive_dict = {
                "thumbnail": [
                    {"width": x} for x in (self.srcSet.get("width") or [])
                ] + [
                    {"height": x} for x in (self.srcSet.get("height") or [])
                ]
            }
            from viur.core.bones.file import ensureDerived
            for blob_key in blob_keys:
                file_obj = db.Query("file").filter("dlkey =", blob_key) \
                    .order(("creationdate", db.SortOrder.Ascending)).getEntry()
                if file_obj:
                    ensureDerived(file_obj.key, f"{skel.kindName}_{name}", derive_dict, skel["key"])

        return blob_keys

    def refresh(self, skel, boneName) -> None:
        """
        Re-parses the text content of the TextBone instance to rebuild the src-set if necessary.

        Nothing happens unless a srcSet is configured. Otherwise each stored value is passed through
        `singleValueFromClient` again, which re-sanitizes it and rebuilds the src-set attributes.

        :param SkeletonInstance skel: A SkeletonInstance object containing the data of an entry.
        :param str boneName: The name of the TextBone for which to refresh the src-set.
        """
        if self.srcSet:
            val = skel[boneName]
            if self.languages and isinstance(val, dict):
                skel[boneName] = {k: self.singleValueFromClient(v, skel, boneName, None)[0] for k, v in val.items()}
            elif not self.languages and isinstance(val, str):
                skel[boneName] = self.singleValueFromClient(val, skel, boneName, None)[0]

    def getSearchTags(self, skel: 'viur.core.skeleton.SkeletonInstance', name: str) -> set[str]:
        """
        Extracts search tags from the text content of a TextBone.

        Every value of the bone that is not None is split into lines and then into words at single
        spaces; the lowercase form of each non-empty word becomes a search tag.

        :param skel: A SkeletonInstance containing the TextBone.
        :param name: The name of the TextBone in the skeleton.
        :return: A set of unique search tags (lowercase words) extracted from the text content of the TextBone.
        """
        result = set()
        for idx, lang, value in self.iter_bone_value(skel, name):
            if value is None:
                continue
            for line in str(value).splitlines():
                for word in line.split(" "):
                    if word:  # consecutive, leading or trailing spaces yield empty strings
                        result.add(word.lower())
        return result

    def getUniquePropertyIndexValues(self, valuesCache: dict, name: str) -> list[str]:
        """
        Retrieves the unique property index values for the TextBone.

        If the TextBone supports multiple languages, this method raises a NotImplementedError, as it's unclear
        whether each language should be kept distinct or not. Otherwise, it calls the superclass's
        getUniquePropertyIndexValues method to retrieve the unique property index values.

        :param valuesCache: A dictionary containing the cached values for the TextBone.
        :param name: The name of the TextBone.
        :return: A list of unique property index values for the TextBone.
        :raises NotImplementedError: If the TextBone supports multiple languages.
        """
        if self.languages:
            # Not yet implemented as it's unclear if we should keep each language distinct or not
            raise NotImplementedError()

        return super().getUniquePropertyIndexValues(valuesCache, name)

    def structure(self) -> dict:
        """Returns the structure of the base class, extended by the `valid_html` configuration of this bone."""
        return super().structure() | {
            "valid_html": self.validHtml,
        }