Make bs4 and soupsieve fully standalone within the project

This commit is contained in:
2025-04-24 17:39:41 +02:00
parent ed336866ee
commit aefb27614f
16 changed files with 72 additions and 77 deletions

View File

@ -4,7 +4,7 @@ __license__ = "MIT"
from collections import defaultdict from collections import defaultdict
import itertools import itertools
import sys import sys
from bs4.element import ( from ...bs4.element import (
CharsetMetaAttributeValue, CharsetMetaAttributeValue,
ContentMetaAttributeValue, ContentMetaAttributeValue,
Stylesheet, Stylesheet,

View File

@ -7,13 +7,13 @@ __all__ = [
import warnings import warnings
import re import re
from bs4.builder import ( from ...bs4.builder import (
PERMISSIVE, PERMISSIVE,
HTML, HTML,
HTML_5, HTML_5,
HTMLTreeBuilder, HTMLTreeBuilder,
) )
from bs4.element import ( from ...bs4.element import (
NamespacedAttribute, NamespacedAttribute,
nonwhitespace_re, nonwhitespace_re,
) )
@ -22,7 +22,7 @@ from html5lib.constants import (
namespaces, namespaces,
prefixes, prefixes,
) )
from bs4.element import ( from ...bs4.element import (
Comment, Comment,
Doctype, Doctype,
NavigableString, NavigableString,
@ -120,7 +120,7 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
if soup: if soup:
self.soup = soup self.soup = soup
else: else:
from bs4 import BeautifulSoup from ...bs4 import BeautifulSoup
# TODO: Why is the parser 'html.parser' here? To avoid an # TODO: Why is the parser 'html.parser' here? To avoid an
# infinite loop? # infinite loop?
self.soup = BeautifulSoup( self.soup = BeautifulSoup(
@ -166,7 +166,7 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
return TextNode(Comment(data), self.soup) return TextNode(Comment(data), self.soup)
def fragmentClass(self): def fragmentClass(self):
from bs4 import BeautifulSoup from ...bs4 import BeautifulSoup
# TODO: Why is the parser 'html.parser' here? To avoid an # TODO: Why is the parser 'html.parser' here? To avoid an
# infinite loop? # infinite loop?
self.soup = BeautifulSoup("", "html.parser") self.soup = BeautifulSoup("", "html.parser")
@ -184,7 +184,7 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
return treebuilder_base.TreeBuilder.getFragment(self).element return treebuilder_base.TreeBuilder.getFragment(self).element
def testSerializer(self, element): def testSerializer(self, element):
from bs4 import BeautifulSoup from ...bs4 import BeautifulSoup
rv = [] rv = []
doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$') doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')

View File

@ -34,16 +34,16 @@ CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
from bs4.element import ( from ...bs4.element import (
CData, CData,
Comment, Comment,
Declaration, Declaration,
Doctype, Doctype,
ProcessingInstruction, ProcessingInstruction,
) )
from bs4.dammit import EntitySubstitution, UnicodeDammit from ...bs4.dammit import EntitySubstitution, UnicodeDammit
from bs4.builder import ( from ...bs4.builder import (
HTML, HTML,
HTMLTreeBuilder, HTMLTreeBuilder,
STRICT, STRICT,

View File

@ -14,14 +14,14 @@ except ImportError as e:
from io import BytesIO from io import BytesIO
from io import StringIO from io import StringIO
from lxml import etree from lxml import etree
from bs4.element import ( from ...bs4.element import (
Comment, Comment,
Doctype, Doctype,
NamespacedAttribute, NamespacedAttribute,
ProcessingInstruction, ProcessingInstruction,
XMLProcessingInstruction, XMLProcessingInstruction,
) )
from bs4.builder import ( from ...bs4.builder import (
FAST, FAST,
HTML, HTML,
HTMLTreeBuilder, HTMLTreeBuilder,
@ -29,7 +29,7 @@ from bs4.builder import (
ParserRejectedMarkup, ParserRejectedMarkup,
TreeBuilder, TreeBuilder,
XML) XML)
from bs4.dammit import EncodingDetector from ...bs4.dammit import EncodingDetector
LXML = 'lxml' LXML = 'lxml'

View File

@ -6,9 +6,9 @@ __license__ = "MIT"
import cProfile import cProfile
from io import StringIO from io import StringIO
from html.parser import HTMLParser from html.parser import HTMLParser
import bs4 from ..bs4 import BeautifulSoup as bs4
from bs4 import BeautifulSoup, __version__ from ..bs4 import BeautifulSoup, __version__
from bs4.builder import builder_registry from ..bs4.builder import builder_registry
import os import os
import pstats import pstats

View File

@ -9,14 +9,14 @@ import re
import sys import sys
import warnings import warnings
try: try:
import soupsieve from ..soupsieve import *
except ImportError as e: except ImportError as e:
soupsieve = None soupsieve = None
warnings.warn( warnings.warn(
'The soupsieve package is not installed. CSS selectors cannot be used.' 'The soupsieve package is not installed. CSS selectors cannot be used.'
) )
from bs4.formatter import ( from ..bs4.formatter import (
Formatter, Formatter,
HTMLFormatter, HTMLFormatter,
XMLFormatter, XMLFormatter,
@ -380,7 +380,7 @@ class PageElement(object):
and not isinstance(new_child, NavigableString)): and not isinstance(new_child, NavigableString)):
new_child = NavigableString(new_child) new_child = NavigableString(new_child)
from bs4 import BeautifulSoup from ..bs4 import BeautifulSoup
if isinstance(new_child, BeautifulSoup): if isinstance(new_child, BeautifulSoup):
# We don't want to end up with a situation where one BeautifulSoup # We don't want to end up with a situation where one BeautifulSoup
# object contains another. Insert the children one at a time. # object contains another. Insert the children one at a time.

View File

@ -1,4 +1,4 @@
from bs4.dammit import EntitySubstitution from ..bs4.dammit import EntitySubstitution
class Formatter(EntitySubstitution): class Formatter(EntitySubstitution):
"""Describes a strategy to use when outputting a parse tree to a string. """Describes a strategy to use when outputting a parse tree to a string.

View File

@ -9,8 +9,8 @@ import copy
import functools import functools
import unittest import unittest
from unittest import TestCase from unittest import TestCase
from bs4 import BeautifulSoup from ..bs4 import BeautifulSoup
from bs4.element import ( from ..bs4.element import (
CharsetMetaAttributeValue, CharsetMetaAttributeValue,
Comment, Comment,
ContentMetaAttributeValue, ContentMetaAttributeValue,
@ -22,7 +22,7 @@ from bs4.element import (
Tag Tag
) )
from bs4.builder import HTMLParserTreeBuilder from ..bs4.builder import HTMLParserTreeBuilder
default_builder = HTMLParserTreeBuilder default_builder = HTMLParserTreeBuilder
BAD_DOCUMENT = """A bare string BAD_DOCUMENT = """A bare string

View File

@ -3,21 +3,21 @@
import unittest import unittest
import warnings import warnings
from bs4 import BeautifulSoup from ...bs4 import BeautifulSoup
from bs4.builder import ( from ...bs4.builder import (
builder_registry as registry, builder_registry as registry,
HTMLParserTreeBuilder, HTMLParserTreeBuilder,
TreeBuilderRegistry, TreeBuilderRegistry,
) )
try: try:
from bs4.builder import HTML5TreeBuilder from ...bs4.builder import HTML5TreeBuilder
HTML5LIB_PRESENT = True HTML5LIB_PRESENT = True
except ImportError: except ImportError:
HTML5LIB_PRESENT = False HTML5LIB_PRESENT = False
try: try:
from bs4.builder import ( from ...bs4.builder import (
LXMLTreeBuilderForXML, LXMLTreeBuilderForXML,
LXMLTreeBuilder, LXMLTreeBuilder,
) )

View File

@ -3,12 +3,12 @@
import warnings import warnings
try: try:
from bs4.builder import HTML5TreeBuilder from ...bs4.builder import HTML5TreeBuilder
HTML5LIB_PRESENT = True HTML5LIB_PRESENT = True
except ImportError as e: except ImportError as e:
HTML5LIB_PRESENT = False HTML5LIB_PRESENT = False
from bs4.element import SoupStrainer from ...bs4.element import SoupStrainer
from bs4.testing import ( from ...bs4.testing import (
HTML5TreeBuilderSmokeTest, HTML5TreeBuilderSmokeTest,
SoupTest, SoupTest,
skipIf, skipIf,

View File

@ -3,9 +3,9 @@ trees."""
from pdb import set_trace from pdb import set_trace
import pickle import pickle
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest from ...bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
from bs4.builder import HTMLParserTreeBuilder from ...bs4.builder import HTMLParserTreeBuilder
from bs4.builder._htmlparser import BeautifulSoupHTMLParser from ...bs4.builder._htmlparser import BeautifulSoupHTMLParser
class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):

View File

@ -12,16 +12,16 @@ except ImportError as e:
LXML_VERSION = (0,) LXML_VERSION = (0,)
if LXML_PRESENT: if LXML_PRESENT:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML from ...bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
from bs4 import ( from ...bs4 import (
BeautifulSoup, BeautifulSoup,
BeautifulStoneSoup, BeautifulStoneSoup,
) )
from bs4.element import Comment, Doctype, SoupStrainer from ...bs4.element import Comment, Doctype, SoupStrainer
from bs4.testing import skipIf from ...bs4.testing import skipIf
from bs4.tests import test_htmlparser from ...bs4.tests import test_htmlparser
from bs4.testing import ( from ...bs4.testing import (
HTMLTreeBuilderSmokeTest, HTMLTreeBuilderSmokeTest,
XMLTreeBuilderSmokeTest, XMLTreeBuilderSmokeTest,
SoupTest, SoupTest,

View File

@ -7,17 +7,17 @@ import unittest
import sys import sys
import tempfile import tempfile
from bs4 import ( from ...bs4 import (
BeautifulSoup, BeautifulSoup,
BeautifulStoneSoup, BeautifulStoneSoup,
GuessedAtParserWarning, GuessedAtParserWarning,
MarkupResemblesLocatorWarning, MarkupResemblesLocatorWarning,
) )
from bs4.builder import ( from ...bs4.builder import (
TreeBuilder, TreeBuilder,
ParserRejectedMarkup, ParserRejectedMarkup,
) )
from bs4.element import ( from ...bs4.element import (
CharsetMetaAttributeValue, CharsetMetaAttributeValue,
Comment, Comment,
ContentMetaAttributeValue, ContentMetaAttributeValue,
@ -27,13 +27,13 @@ from bs4.element import (
NavigableString, NavigableString,
) )
import bs4.dammit from ...bs4.dammit import *
from bs4.dammit import ( from ...bs4.dammit import (
EntitySubstitution, EntitySubstitution,
UnicodeDammit, UnicodeDammit,
EncodingDetector, EncodingDetector,
) )
from bs4.testing import ( from ...bs4.testing import (
default_builder, default_builder,
SoupTest, SoupTest,
skipIf, skipIf,
@ -41,7 +41,7 @@ from bs4.testing import (
import warnings import warnings
try: try:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML from ...bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
LXML_PRESENT = True LXML_PRESENT = True
except ImportError as e: except ImportError as e:
LXML_PRESENT = False LXML_PRESENT = False
@ -418,13 +418,13 @@ class TestEncodingConversion(SoupTest):
def test_ascii_in_unicode_out(self): def test_ascii_in_unicode_out(self):
# ASCII input is converted to Unicode. The original_encoding # ASCII input is converted to Unicode. The original_encoding
# attribute is set to 'utf-8', a superset of ASCII. # attribute is set to 'utf-8', a superset of ASCII.
chardet = bs4.dammit.chardet_dammit chardet = chardet_dammit
logging.disable(logging.WARNING) logging.disable(logging.WARNING)
try: try:
def noop(str): def noop(str):
return None return None
# Disable chardet, which will realize that the ASCII is ASCII. # Disable chardet, which will realize that the ASCII is ASCII.
bs4.dammit.chardet_dammit = noop chardet_dammit = noop
ascii = b"<foo>a</foo>" ascii = b"<foo>a</foo>"
soup_from_ascii = self.soup(ascii) soup_from_ascii = self.soup(ascii)
unicode_output = soup_from_ascii.decode() unicode_output = soup_from_ascii.decode()
@ -433,7 +433,7 @@ class TestEncodingConversion(SoupTest):
self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
finally: finally:
logging.disable(logging.NOTSET) logging.disable(logging.NOTSET)
bs4.dammit.chardet_dammit = chardet chardet_dammit = chardet
def test_unicode_in_unicode_out(self): def test_unicode_in_unicode_out(self):
# Unicode input is left alone. The original_encoding attribute # Unicode input is left alone. The original_encoding attribute
@ -574,12 +574,12 @@ class TestUnicodeDammit(unittest.TestCase):
doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b> <html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>""" <i>\310\322\321\220\312\321\355\344</i></html>"""
chardet = bs4.dammit.chardet_dammit chardet = chardet_dammit
logging.disable(logging.WARNING) logging.disable(logging.WARNING)
try: try:
def noop(str): def noop(str):
return None return None
bs4.dammit.chardet_dammit = noop chardet_dammit = noop
dammit = UnicodeDammit(doc) dammit = UnicodeDammit(doc)
self.assertEqual(True, dammit.contains_replacement_characters) self.assertEqual(True, dammit.contains_replacement_characters)
self.assertTrue("\ufffd" in dammit.unicode_markup) self.assertTrue("\ufffd" in dammit.unicode_markup)
@ -588,7 +588,7 @@ class TestUnicodeDammit(unittest.TestCase):
self.assertTrue(soup.contains_replacement_characters) self.assertTrue(soup.contains_replacement_characters)
finally: finally:
logging.disable(logging.NOTSET) logging.disable(logging.NOTSET)
bs4.dammit.chardet_dammit = chardet chardet_dammit = chardet
def test_byte_order_mark_removed(self): def test_byte_order_mark_removed(self):
# A document written in UTF-16LE will have its byte order marker stripped. # A document written in UTF-16LE will have its byte order marker stripped.

View File

@ -14,12 +14,12 @@ import copy
import pickle import pickle
import re import re
import warnings import warnings
from bs4 import BeautifulSoup from ...bs4 import BeautifulSoup
from bs4.builder import ( from ...bs4.builder import (
builder_registry, builder_registry,
HTMLParserTreeBuilder, HTMLParserTreeBuilder,
) )
from bs4.element import ( from ...bs4.element import (
PY3K, PY3K,
CData, CData,
Comment, Comment,
@ -33,11 +33,11 @@ from bs4.element import (
Tag, Tag,
TemplateString, TemplateString,
) )
from bs4.testing import ( from ...bs4.testing import (
SoupTest, SoupTest,
skipIf, skipIf,
) )
from soupsieve import SelectorSyntaxError from ...soupsieve import SelectorSyntaxError
XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None) XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None)
LXML_PRESENT = (builder_registry.lookup("lxml") is not None) LXML_PRESENT = (builder_registry.lookup("lxml") is not None)

View File

@ -6,7 +6,8 @@ from .import css_types as ct
import unicodedata import unicodedata
from collections.abc import Sequence from collections.abc import Sequence
import bs4 from ..bs4 import *
from ..bs4.element import *
# Empty tag pattern (whitespace okay) # Empty tag pattern (whitespace okay)
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]') RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')
@ -90,37 +91,37 @@ class _DocumentNav(object):
@staticmethod @staticmethod
def is_doc(obj): def is_doc(obj):
"""Is `BeautifulSoup` object.""" """Is `BeautifulSoup` object."""
return isinstance(obj, bs4.BeautifulSoup) return isinstance(obj, BeautifulSoup)
@staticmethod @staticmethod
def is_tag(obj): def is_tag(obj):
"""Is tag.""" """Is tag."""
return isinstance(obj, bs4.Tag) return isinstance(obj, Tag)
@staticmethod @staticmethod
def is_declaration(obj): # pragma: no cover def is_declaration(obj): # pragma: no cover
"""Is declaration.""" """Is declaration."""
return isinstance(obj, bs4.Declaration) return isinstance(obj, Declaration)
@staticmethod @staticmethod
def is_cdata(obj): def is_cdata(obj):
"""Is CDATA.""" """Is CDATA."""
return isinstance(obj, bs4.CData) return isinstance(obj, CData)
@staticmethod @staticmethod
def is_processing_instruction(obj): # pragma: no cover def is_processing_instruction(obj): # pragma: no cover
"""Is processing instruction.""" """Is processing instruction."""
return isinstance(obj, bs4.ProcessingInstruction) return isinstance(obj, ProcessingInstruction)
@staticmethod @staticmethod
def is_navigable_string(obj): def is_navigable_string(obj):
"""Is navigable string.""" """Is navigable string."""
return isinstance(obj, bs4.NavigableString) return isinstance(obj, NavigableString)
@staticmethod @staticmethod
def is_special_string(obj): def is_special_string(obj):
"""Is special string.""" """Is special string."""
return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype)) return isinstance(obj, (Comment, Declaration, CData, ProcessingInstruction, Doctype))
@classmethod @classmethod
def is_content_string(cls, obj): def is_content_string(cls, obj):

View File

@ -11,18 +11,14 @@ import os.path
import concurrent.futures import concurrent.futures
import urllib.request import urllib.request
import base64 import base64
from .lib.bs4 import BeautifulSoup as bs4
# --- Bundled library imports ---
# Explicitly import from the 'lib' directory, now that the package root is in sys.path
from lib import bs4
from lib.markdown2 import Markdown
# --- End bundled library imports ---
from functools import partial from functools import partial
from .lib.markdown2 import Markdown
__all__ = ("markdown2html",) __all__ = ("markdown2html",)
# Use the imported module name
markdowner = Markdown(extras=["fenced-code-blocks", "cuddled-lists"]) markdowner = Markdown(extras=["fenced-code-blocks", "cuddled-lists"])
# FIXME: how do I choose how many workers I want? Does thread pool reuse threads or # FIXME: how do I choose how many workers I want? Does thread pool reuse threads or
@ -37,7 +33,6 @@ def markdown2html(markdown, basepath, re_render, resources, viewport_width, font
""" """
html = markdowner.convert(markdown) html = markdowner.convert(markdown)
# Use the imported module name
soup = bs4.BeautifulSoup(html, "html.parser") soup = bs4.BeautifulSoup(html, "html.parser")
for img_element in soup.find_all("img"): for img_element in soup.find_all("img"):
src = img_element["src"] src = img_element["src"]
@ -57,15 +52,14 @@ def markdown2html(markdown, basepath, re_render, resources, viewport_width, font
# realpath: simplify that paths so that we don't have duplicated caches # realpath: simplify that paths so that we don't have duplicated caches
path = os.path.realpath(os.path.expanduser(os.path.join(basepath, src))) path = os.path.realpath(os.path.expanduser(os.path.join(basepath, src)))
base64_img, (width, height) = get_base64_image(path, re_render, resources) # Renamed local var to avoid conflict base64, (width, height) = get_base64_image(path, re_render, resources)
img_element["src"] = base64_img img_element["src"] = base64
if width > viewport_width: if width > viewport_width:
img_element["width"] = viewport_width img_element["width"] = viewport_width
img_element["height"] = viewport_width * (height / width) img_element["height"] = viewport_width * (height / width)
# remove comments, because they pollute the console with error messages # remove comments, because they pollute the console with error messages
# Use the imported module name
for comment_element in soup.find_all( for comment_element in soup.find_all(
text=lambda text: isinstance(text, bs4.Comment) text=lambda text: isinstance(text, bs4.Comment)
): ):
@ -84,7 +78,7 @@ def markdown2html(markdown, basepath, re_render, resources, viewport_width, font
.replace(" ", '<i class="space">.</i>') .replace(" ", '<i class="space">.</i>')
.replace("\n", "<br />") .replace("\n", "<br />")
) )
# Use the imported module name
code_element.replace_with(bs4.BeautifulSoup(fixed_pre, "html.parser")) code_element.replace_with(bs4.BeautifulSoup(fixed_pre, "html.parser"))
# FIXME: highlight the code using Sublime's syntax # FIXME: highlight the code using Sublime's syntax