https://www.crummy.com/software/BeautifulSoup/bs4/doc/
pip install beautifulsoup4
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_text)
links = [element.get('href') for element in soup.find_all('a')]
bs4.
BeautifulSoup
(html_text: str, parser: str = 'html.parser')
Парсер HTML
soup = BeautifulSoup(some_html_string)
soup = BeautifulSoup(some_html_string, 'html.parser')
soup = BeautifulSoup(some_html_string, 'lxml')
soup = BeautifulSoup(some_html_string, 'lxml-xml')
soup = BeautifulSoup(some_html_string, 'html5lib')
body
Возвращает bs4.element.Tag
head
Возвращает bs4.element.Tag
title
Возвращает bs4.element.Tag
get_text
()
Возвращает строку со всем текстом страницы, без HTML-разметки
find
(name=None, attributes={}, recursive=True, text=None, **kwargs)
name = None
attributes = {}
recursive = True
text = None
id
string
Возвращает первый найденный элемент (bs4.element.Tag) или None, если ничего не найдено
elem = soup.find(id='myId')
elem = soup.find('h2', string='Python')
elem = soup.find('h2', string=lambda text: 'Python' in text)
findAll
(name=None, attributes={}, recursive=True, text=None, limit=None, **kwargs) → bs4.element.ResultSet
Поиск элементов на странице
span_list = bs_obj.findAll('span', {'class': 'green'})
for span in span_list:
    print(span.get_text())
hs = bs_obj.findAll({'h1', 'h2', 'h3', 'h4', 'h5', 'h6'})
id_text_elem = bs_obj.findAll(id='text')
imgs = bs_obj.findAll('img', {'src': re.compile(r'\.\./img/.*\.jpg')})
imgs = bs_obj.findAll(lambda tag: len(tag.attrs) == 2)
prettify
() → str
Возвращает отформатированную строку с содержимым документа
print(soup.prettify())
bs4.
Comment
Комментарии