From aefb27614fd1aec8ec21baaedacc083b57cd93ce Mon Sep 17 00:00:00 2001 From: Christian Morpurgo Date: Thu, 24 Apr 2025 17:39:41 +0200 Subject: [PATCH] fully make bs4 and soupsieve standalone in the project --- lib/bs4/builder/__init__.py | 2 +- lib/bs4/builder/_html5lib.py | 12 ++++++------ lib/bs4/builder/_htmlparser.py | 6 +++--- lib/bs4/builder/_lxml.py | 6 +++--- lib/bs4/diagnose.py | 6 +++--- lib/bs4/element.py | 6 +++--- lib/bs4/formatter.py | 2 +- lib/bs4/testing.py | 6 +++--- lib/bs4/tests/test_builder_registry.py | 8 ++++---- lib/bs4/tests/test_html5lib.py | 6 +++--- lib/bs4/tests/test_htmlparser.py | 6 +++--- lib/bs4/tests/test_lxml.py | 12 ++++++------ lib/bs4/tests/test_soup.py | 26 +++++++++++++------------- lib/bs4/tests/test_tree.py | 10 +++++----- lib/soupsieve/css_match.py | 17 +++++++++-------- markdown2html.py | 18 ++++++------------ 16 files changed, 72 insertions(+), 77 deletions(-) diff --git a/lib/bs4/builder/__init__.py b/lib/bs4/builder/__init__.py index 03fbd6a..71e34f0 100644 --- a/lib/bs4/builder/__init__.py +++ b/lib/bs4/builder/__init__.py @@ -4,7 +4,7 @@ __license__ = "MIT" from collections import defaultdict import itertools import sys -from bs4.element import ( +from ...bs4.element import ( CharsetMetaAttributeValue, ContentMetaAttributeValue, Stylesheet, diff --git a/lib/bs4/builder/_html5lib.py b/lib/bs4/builder/_html5lib.py index 69aefd7..693248e 100644 --- a/lib/bs4/builder/_html5lib.py +++ b/lib/bs4/builder/_html5lib.py @@ -7,13 +7,13 @@ __all__ = [ import warnings import re -from bs4.builder import ( +from ...bs4.builder import ( PERMISSIVE, HTML, HTML_5, HTMLTreeBuilder, ) -from bs4.element import ( +from ...bs4.element import ( NamespacedAttribute, nonwhitespace_re, ) @@ -22,7 +22,7 @@ from html5lib.constants import ( namespaces, prefixes, ) -from bs4.element import ( +from ...bs4.element import ( Comment, Doctype, NavigableString, @@ -120,7 +120,7 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): if soup: self.soup = soup else: - from bs4 import BeautifulSoup + from ...bs4 import BeautifulSoup # TODO: Why is the parser 'html.parser' here? To avoid an # infinite loop? self.soup = BeautifulSoup( @@ -166,7 +166,7 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): return TextNode(Comment(data), self.soup) def fragmentClass(self): - from bs4 import BeautifulSoup + from ...bs4 import BeautifulSoup # TODO: Why is the parser 'html.parser' here? To avoid an # infinite loop? self.soup = BeautifulSoup("", "html.parser") @@ -184,7 +184,7 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): return treebuilder_base.TreeBuilder.getFragment(self).element def testSerializer(self, element): - from bs4 import BeautifulSoup + from ...bs4 import BeautifulSoup rv = [] doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$') diff --git a/lib/bs4/builder/_htmlparser.py b/lib/bs4/builder/_htmlparser.py index 88860a9..ab9cff1 100644 --- a/lib/bs4/builder/_htmlparser.py +++ b/lib/bs4/builder/_htmlparser.py @@ -34,16 +34,16 @@ CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3 CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 -from bs4.element import ( +from ...bs4.element import ( CData, Comment, Declaration, Doctype, ProcessingInstruction, ) -from bs4.dammit import EntitySubstitution, UnicodeDammit +from ...bs4.dammit import EntitySubstitution, UnicodeDammit -from bs4.builder import ( +from ...bs4.builder import ( HTML, HTMLTreeBuilder, STRICT, diff --git a/lib/bs4/builder/_lxml.py b/lib/bs4/builder/_lxml.py index 432a2c8..5fc19d1 100644 --- a/lib/bs4/builder/_lxml.py +++ b/lib/bs4/builder/_lxml.py @@ -14,14 +14,14 @@ except ImportError as e: from io import BytesIO from io import StringIO from lxml import etree -from bs4.element import ( +from ...bs4.element import ( Comment, Doctype, NamespacedAttribute, ProcessingInstruction, XMLProcessingInstruction, ) -from bs4.builder import ( +from ...bs4.builder import ( FAST, HTML, HTMLTreeBuilder, @@ -29,7 +29,7 @@ from bs4.builder import ( ParserRejectedMarkup, TreeBuilder, XML) -from bs4.dammit import EncodingDetector +from ...bs4.dammit import EncodingDetector LXML = 'lxml' diff --git a/lib/bs4/diagnose.py b/lib/bs4/diagnose.py index 500e92d..d5c6fb9 100644 --- a/lib/bs4/diagnose.py +++ b/lib/bs4/diagnose.py @@ -6,9 +6,9 @@ __license__ = "MIT" import cProfile from io import StringIO from html.parser import HTMLParser -import bs4 -from bs4 import BeautifulSoup, __version__ -from bs4.builder import builder_registry +from ..bs4 import BeautifulSoup as bs4 +from ..bs4 import BeautifulSoup, __version__ +from ..bs4.builder import builder_registry import os import pstats diff --git a/lib/bs4/element.py b/lib/bs4/element.py index 81d9db9..cafd4ad 100644 --- a/lib/bs4/element.py +++ b/lib/bs4/element.py @@ -9,14 +9,14 @@ import re import sys import warnings try: - import soupsieve + from ..soupsieve import * except ImportError as e: soupsieve = None warnings.warn( 'The soupsieve package is not installed. CSS selectors cannot be used.' ) -from bs4.formatter import ( +from ..bs4.formatter import ( Formatter, HTMLFormatter, XMLFormatter, @@ -380,7 +380,7 @@ class PageElement(object): and not isinstance(new_child, NavigableString)): new_child = NavigableString(new_child) - from bs4 import BeautifulSoup + from ..bs4 import BeautifulSoup if isinstance(new_child, BeautifulSoup): # We don't want to end up with a situation where one BeautifulSoup # object contains another. Insert the children one at a time. diff --git a/lib/bs4/formatter.py b/lib/bs4/formatter.py index 2cbab4c..236443a 100644 --- a/lib/bs4/formatter.py +++ b/lib/bs4/formatter.py @@ -1,4 +1,4 @@ -from bs4.dammit import EntitySubstitution +from ..bs4.dammit import EntitySubstitution class Formatter(EntitySubstitution): """Describes a strategy to use when outputting a parse tree to a string. diff --git a/lib/bs4/testing.py b/lib/bs4/testing.py index 9ca507b..20f6c19 100644 --- a/lib/bs4/testing.py +++ b/lib/bs4/testing.py @@ -9,8 +9,8 @@ import copy import functools import unittest from unittest import TestCase -from bs4 import BeautifulSoup -from bs4.element import ( +from ..bs4 import BeautifulSoup +from ..bs4.element import ( CharsetMetaAttributeValue, Comment, ContentMetaAttributeValue, @@ -22,7 +22,7 @@ from bs4.element import ( Tag ) -from bs4.builder import HTMLParserTreeBuilder +from ..bs4.builder import HTMLParserTreeBuilder default_builder = HTMLParserTreeBuilder BAD_DOCUMENT = """A bare string diff --git a/lib/bs4/tests/test_builder_registry.py b/lib/bs4/tests/test_builder_registry.py index 90cad82..2358cd4 100644 --- a/lib/bs4/tests/test_builder_registry.py +++ b/lib/bs4/tests/test_builder_registry.py @@ -3,21 +3,21 @@ import unittest import warnings -from bs4 import BeautifulSoup -from bs4.builder import ( +from ...bs4 import BeautifulSoup +from ...bs4.builder import ( builder_registry as registry, HTMLParserTreeBuilder, TreeBuilderRegistry, ) try: - from bs4.builder import HTML5TreeBuilder + from ...bs4.builder import HTML5TreeBuilder HTML5LIB_PRESENT = True except ImportError: HTML5LIB_PRESENT = False try: - from bs4.builder import ( + from ...bs4.builder import ( LXMLTreeBuilderForXML, LXMLTreeBuilder, ) diff --git a/lib/bs4/tests/test_html5lib.py b/lib/bs4/tests/test_html5lib.py index b77659b..fc9ef5c 100644 --- a/lib/bs4/tests/test_html5lib.py +++ b/lib/bs4/tests/test_html5lib.py @@ -3,12 +3,12 @@ import warnings try: - from bs4.builder import HTML5TreeBuilder + from ...bs4.builder import HTML5TreeBuilder HTML5LIB_PRESENT = True except ImportError as e: HTML5LIB_PRESENT = False -from bs4.element import SoupStrainer -from bs4.testing import ( +from ...bs4.element import SoupStrainer +from ...bs4.testing import ( HTML5TreeBuilderSmokeTest, SoupTest, skipIf, diff --git a/lib/bs4/tests/test_htmlparser.py b/lib/bs4/tests/test_htmlparser.py index aeff094..27064bc 100644 --- a/lib/bs4/tests/test_htmlparser.py +++ b/lib/bs4/tests/test_htmlparser.py @@ -3,9 +3,9 @@ trees.""" from pdb import set_trace import pickle -from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest -from bs4.builder import HTMLParserTreeBuilder -from bs4.builder._htmlparser import BeautifulSoupHTMLParser +from ...bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest +from ...bs4.builder import HTMLParserTreeBuilder +from ...bs4.builder._htmlparser import BeautifulSoupHTMLParser class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): diff --git a/lib/bs4/tests/test_lxml.py b/lib/bs4/tests/test_lxml.py index 3d0c75f..700fdbd 100644 --- a/lib/bs4/tests/test_lxml.py +++ b/lib/bs4/tests/test_lxml.py @@ -12,16 +12,16 @@ except ImportError as e: LXML_VERSION = (0,) if LXML_PRESENT: - from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML + from ...bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML -from bs4 import ( +from ...bs4 import ( BeautifulSoup, BeautifulStoneSoup, ) -from bs4.element import Comment, Doctype, SoupStrainer -from bs4.testing import skipIf -from bs4.tests import test_htmlparser -from bs4.testing import ( +from ...bs4.element import Comment, Doctype, SoupStrainer +from ...bs4.testing import skipIf +from ...bs4.tests import test_htmlparser +from ...bs4.testing import ( HTMLTreeBuilderSmokeTest, XMLTreeBuilderSmokeTest, SoupTest, diff --git a/lib/bs4/tests/test_soup.py b/lib/bs4/tests/test_soup.py index 1ba3feb..9ec6f14 100644 --- a/lib/bs4/tests/test_soup.py +++ b/lib/bs4/tests/test_soup.py @@ -7,17 +7,17 @@ import unittest import sys import tempfile -from bs4 import ( +from ...bs4 import ( BeautifulSoup, BeautifulStoneSoup, GuessedAtParserWarning, MarkupResemblesLocatorWarning, ) -from bs4.builder import ( +from ...bs4.builder import ( TreeBuilder, ParserRejectedMarkup, ) -from bs4.element import ( +from ...bs4.element import ( CharsetMetaAttributeValue, Comment, ContentMetaAttributeValue, @@ -27,13 +27,13 @@ from bs4.element import ( NavigableString, ) -import bs4.dammit -from bs4.dammit import ( +from ...bs4.dammit import * +from ...bs4.dammit import ( EntitySubstitution, UnicodeDammit, EncodingDetector, ) -from bs4.testing import ( +from ...bs4.testing import ( default_builder, SoupTest, skipIf, @@ -41,7 +41,7 @@ from bs4.testing import ( import warnings try: - from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML + from ...bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML LXML_PRESENT = True except ImportError as e: LXML_PRESENT = False @@ -418,13 +418,13 @@ class TestEncodingConversion(SoupTest): def test_ascii_in_unicode_out(self): # ASCII input is converted to Unicode. The original_encoding # attribute is set to 'utf-8', a superset of ASCII. - chardet = bs4.dammit.chardet_dammit + chardet = chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None # Disable chardet, which will realize that the ASCII is ASCII. - bs4.dammit.chardet_dammit = noop + chardet_dammit = noop ascii = b"a" soup_from_ascii = self.soup(ascii) unicode_output = soup_from_ascii.decode() @@ -433,7 +433,7 @@ class TestEncodingConversion(SoupTest): self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") finally: logging.disable(logging.NOTSET) - bs4.dammit.chardet_dammit = chardet + chardet_dammit = chardet def test_unicode_in_unicode_out(self): # Unicode input is left alone. The original_encoding attribute @@ -574,12 +574,12 @@ class TestUnicodeDammit(unittest.TestCase): doc = b"""\357\273\277 \330\250\330\252\330\261 \310\322\321\220\312\321\355\344""" - chardet = bs4.dammit.chardet_dammit + chardet = chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None - bs4.dammit.chardet_dammit = noop + chardet_dammit = noop dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) self.assertTrue("\ufffd" in dammit.unicode_markup) @@ -588,7 +588,7 @@ class TestUnicodeDammit(unittest.TestCase): self.assertTrue(soup.contains_replacement_characters) finally: logging.disable(logging.NOTSET) - bs4.dammit.chardet_dammit = chardet + chardet_dammit = chardet def test_byte_order_mark_removed(self): # A document written in UTF-16LE will have its byte order marker stripped. diff --git a/lib/bs4/tests/test_tree.py b/lib/bs4/tests/test_tree.py index 02ef73f..f72d199 100644 --- a/lib/bs4/tests/test_tree.py +++ b/lib/bs4/tests/test_tree.py @@ -14,12 +14,12 @@ import copy import pickle import re import warnings -from bs4 import BeautifulSoup -from bs4.builder import ( +from ...bs4 import BeautifulSoup +from ...bs4.builder import ( builder_registry, HTMLParserTreeBuilder, ) -from bs4.element import ( +from ...bs4.element import ( PY3K, CData, Comment, @@ -33,11 +33,11 @@ from bs4.element import ( Tag, TemplateString, ) -from bs4.testing import ( +from ...bs4.testing import ( SoupTest, skipIf, ) -from soupsieve import SelectorSyntaxError +from ...soupsieve import SelectorSyntaxError XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None) LXML_PRESENT = (builder_registry.lookup("lxml") is not None) diff --git a/lib/soupsieve/css_match.py b/lib/soupsieve/css_match.py index a9eeaad..ca00c07 100644 --- a/lib/soupsieve/css_match.py +++ b/lib/soupsieve/css_match.py @@ -6,7 +6,8 @@ from .import css_types as ct import unicodedata from collections.abc import Sequence -import bs4 +from ..bs4 import * +from ..bs4.element import * # Empty tag pattern (whitespace okay) RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]') @@ -90,37 +91,37 @@ class _DocumentNav(object): @staticmethod def is_doc(obj): """Is `BeautifulSoup` object.""" - return isinstance(obj, bs4.BeautifulSoup) + return isinstance(obj, BeautifulSoup) @staticmethod def is_tag(obj): """Is tag.""" - return isinstance(obj, bs4.Tag) + return isinstance(obj, Tag) @staticmethod def is_declaration(obj): # pragma: no cover """Is declaration.""" - return isinstance(obj, bs4.Declaration) + return isinstance(obj, Declaration) @staticmethod def is_cdata(obj): """Is CDATA.""" - return isinstance(obj, bs4.CData) + return isinstance(obj, CData) @staticmethod def is_processing_instruction(obj): # pragma: no cover """Is processing instruction.""" - return isinstance(obj, bs4.ProcessingInstruction) + return isinstance(obj, ProcessingInstruction) @staticmethod def is_navigable_string(obj): """Is navigable string.""" - return isinstance(obj, bs4.NavigableString) + return isinstance(obj, NavigableString) @staticmethod def is_special_string(obj): """Is special string.""" - return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype)) + return isinstance(obj, (Comment, Declaration, CData, ProcessingInstruction, Doctype)) @classmethod def is_content_string(cls, obj): diff --git a/markdown2html.py b/markdown2html.py index 73c5d4f..213f560 100644 --- a/markdown2html.py +++ b/markdown2html.py @@ -11,18 +11,14 @@ import os.path import concurrent.futures import urllib.request import base64 - -# --- Bundled library imports --- -# Explicitly import from the 'lib' directory, now that the package root is in sys.path -from lib import bs4 -from lib.markdown2 import Markdown -# --- End bundled library imports --- +from .lib.bs4 import BeautifulSoup as bs4 from functools import partial +from .lib.markdown2 import Markdown + __all__ = ("markdown2html",) -# Use the imported module name markdowner = Markdown(extras=["fenced-code-blocks", "cuddled-lists"]) # FIXME: how do I choose how many workers I want? Does thread pool reuse threads or @@ -37,7 +33,6 @@ def markdown2html(markdown, basepath, re_render, resources, viewport_width, font """ html = markdowner.convert(markdown) - # Use the imported module name soup = bs4.BeautifulSoup(html, "html.parser") for img_element in soup.find_all("img"): src = img_element["src"] @@ -57,15 +52,14 @@ def markdown2html(markdown, basepath, re_render, resources, viewport_width, font # realpath: simplify that paths so that we don't have duplicated caches path = os.path.realpath(os.path.expanduser(os.path.join(basepath, src))) - base64_img, (width, height) = get_base64_image(path, re_render, resources) # Renamed local var to avoid conflict + base64, (width, height) = get_base64_image(path, re_render, resources) - img_element["src"] = base64_img + img_element["src"] = base64 if width > viewport_width: img_element["width"] = viewport_width img_element["height"] = viewport_width * (height / width) # remove comments, because they pollute the console with error messages - # Use the imported module name for comment_element in soup.find_all( text=lambda text: isinstance(text, bs4.Comment) ): @@ -84,7 +78,7 @@ def markdown2html(markdown, basepath, re_render, resources, viewport_width, font .replace(" ", '.') .replace("\n", "
") ) - # Use the imported module name + code_element.replace_with(bs4.BeautifulSoup(fixed_pre, "html.parser")) # FIXME: highlight the code using Sublime's syntax