#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re


def regular_clean(self, str1: str, str2: str):
    '''
    Clean the data format with regular expressions
    :param str1: content
    :param str2: html_content
    :return: the cleaned results
    '''

    def new_line(text):
        text = re.sub(r'<br\s?/?>', '<br>', text)
        text = re.sub(
            r'</?a>|</?em>|</?html>|</?body>|'
            r'</?head>|<[a-zA-Z]{1,10}\s?/>|'
            r'</?strong>|</?blockquote>|</?b>|'
            r'</?span>|</?i>|</?hr>|</?font>',
            '',
            text)
        text = re.sub(r'\n', '', text)
        text = re.sub('<h[1-6]>', '<p>', text)
        text = re.sub('</h[1-6]>', '</p>', text)
        text = text.replace('</p>', '</p>\n').replace('<br>', '<br/>')
        return text

    # TODO handle blank-line issues
    str1, str2 = self.clean_blank(str1), self.clean_blank(str2)

    # TODO html_content handling: 1. remove redundant/unusable tags and tags
    # that break the display; 2. fix and normalize line breaks
    str2 = new_line(text=str2)

    return str1, str2
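To make the effect of these substitutions concrete, here is a minimal standalone sketch of the same new_line normalization; the sample_html fragment is invented for illustration and uses only a subset of the tag list above:

import re

def new_line(text):
    # Unify <br>, <br/>, <br /> into one form.
    text = re.sub(r'<br\s?/?>', '<br>', text)
    # Strip inline formatting tags (a subset of the full list above).
    text = re.sub(r'</?a>|</?em>|</?strong>|</?span>|</?i>|</?font>', '', text)
    # Drop raw newlines, then demote headings to paragraphs.
    text = re.sub(r'\n', '', text)
    text = re.sub('<h[1-6]>', '<p>', text)
    text = re.sub('</h[1-6]>', '</p>', text)
    # Re-add a newline after every paragraph and normalize <br>.
    return text.replace('</p>', '</p>\n').replace('<br>', '<br/>')

sample_html = '<h2>Title</h2>\n<p>Hello <a>world</a><br /></p>'
print(new_line(sample_html))
# Output:
# <p>Title</p>
# <p>Hello world<br/></p>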
To wrap up, here is the complete class that packages all of the methods above:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
author: szhan
date: 2020-08-17
summary: clean html_content and extract a pure data format
'''

import re
from lxml import etree
from pyquery import PyQuery as pq
from urllib.parse import urlsplit, urljoin

from loguru import logger


class CleanArticle:

    def __init__(
            self,
            text: str,
            url: str = '',
            xpath_dict: dict = None,
            pq_dict: dict = None
    ):
        self.text = text
        self.url = url
        self.xpath_dict = xpath_dict or dict()
        self.pq_dict = pq_dict or dict()

    @staticmethod
    def absolute_url(baseurl: str, url: str) -> str:
        '''
        Complete a relative url
        :param baseurl: scheme url
        :param url: target url
        :return: complete url
        '''
        target_url = url if urlsplit(url).scheme else urljoin(baseurl, url)
        return target_url

    @staticmethod
    def clean_blank(text):
        '''
        Whitespace handling
        :param text:
        :return:
        '''
        text = text.replace('\n', '').replace('\u3000', '').replace('\t', '').replace('\xa0', '')
        text = re.sub(r'\s{2,}', '', text)
        text = re.sub(r'\n{2,}', '\n', text)
        text = text.strip('\n').strip()
        return text

    def run(self):
        '''
        :return: cleaned content, html_content
        '''
        if (not bool(self.text)) or (not isinstance(self.text, str)):
            raise ValueError('html_content has a bad type value')

        # Step 1: use xpath to remove whitespace, comments, and tags such as
        # iframe, button, form, script, style, video, etc.
        text = self.xpath_clean(self.text, self.xpath_dict)

        # Step 2: use pyquery to handle the finer details
        str1, str2 = self.pyquery_clean(text, self.url, self.pq_dict)

        # Final regex processing
        content, html_content = self.regular_clean(str1, str2)

        return content, html_content

    def xpath_clean(self, text: str, xpath_dict: dict) -> str:
        '''
        Remove unnecessary elements with xpath
        :param text: html_content
        :param xpath_dict: xpath expressions of the targets to remove
        :return: html_content as a string
        '''
        remove_by_xpath = xpath_dict if xpath_dict else dict()

        # Tags that should almost always be removed, barring extreme cases
        remove_by_xpath.update({
            '_remove_2': '//iframe',
            '_remove_4': '//button',
            '_remove_5': '//form',
            '_remove_6': '//input',
            '_remove_7': '//select',
            '_remove_8': '//option',
            '_remove_9': '//textarea',
            '_remove_10': '//figure',
            '_remove_11': '//figcaption',
            '_remove_12': '//frame',
            '_remove_13': '//video',
            '_remove_14': '//script',
            '_remove_15': '//style'
        })

        parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True)
        selector = etree.HTML(text, parser=parser)

        # Routine removal of unwanted tags
        for xpath in remove_by_xpath.values():
            for bad in selector.xpath(xpath):
                bad_string = etree.tostring(bad, encoding='utf-8',
                                            pretty_print=True).decode()
                logger.debug(f"clean article content : {bad_string}")
                bad.getparent().remove(bad)

        skip_tip = ("name()='img' or name()='tr' or "
                    "name()='th' or name()='tbody' or "
                    "name()='thead' or name()='table'")
        # Check every remaining tag for content; remove any that are empty
        for p in selector.xpath(f"//*[not({skip_tip})]"):
            # Skip tags that contain table/img descendants or any text
            if p.xpath(f".//*[{skip_tip}]") or \
                    bool(re.sub(r'\s', '', p.xpath('string(.)'))):
                continue

            bad_p = etree.tostring(p, encoding='utf-8',
                                   pretty_print=True).decode()
            logger.debug(f"clean p tag : {bad_p}")
            p.getparent().remove(p)

        return etree.tostring(selector, encoding='utf-8',
                              pretty_print=True).decode()

    def pyquery_clean(self, text, url, pq_dict) -> object:
        '''
        Apply the necessary processing with pyquery
        :param text:
        :param url:
        :param pq_dict:
        :return:
        '''
        # pq expressions of the targets to remove
        remove_by_pq = pq_dict if pq_dict else dict()
        # Attribute whitelist
        attr_white_list = ['rowspan', 'colspan']
        # Image link keys
        img_key_list = ['src', 'data-echo', 'data-src', 'data-original']
        # Build the pyquery object
        dom = pq(text)

        # Remove unwanted tags
        for bad_tag in remove_by_pq.values():
            for bad in dom(bad_tag):
                bad_string = pq(bad).html()
                logger.debug(f"clean article content : {bad_string}")
            dom.remove(bad_tag)

        # Handle the attributes of each tag
        for tag in dom('*'):
            for key, value in tag.attrib.items():
                # Keep the rowspan and colspan attributes of tables
                if key in attr_white_list:
                    continue
                # Complete relative image urls, then swap them in as src
                if key in img_key_list:
                    img_url = self.absolute_url(url, value)
                    pq(tag).remove_attr(key)
                    pq(tag).attr('src', img_url)
                    pq(tag).attr('alt', '')
                # Keep the alt attribute of img tags, but blank it out
                elif key == 'alt':
                    pq(tag).attr(key, '')
                # Remove every other attribute
                else:
                    pq(tag).remove_attr(key)

        return dom.text(), dom.html()

    def regular_clean(self, str1: str, str2: str):
        '''
        Clean the data format with regular expressions
        :param str1: content
        :param str2: html_content
        :return: the cleaned results
        '''

        def new_line(text):
            text = re.sub(r'<br\s?/?>', '<br>', text)
            text = re.sub(
                r'</?a>|</?em>|</?html>|</?body>|'
                r'</?head>|<[a-zA-Z]{1,10}\s?/>|'
                r'</?strong>|</?blockquote>|</?b>|'
                r'</?span>|</?i>|</?hr>|</?font>',
                '',
                text)
            text = re.sub(r'\n', '', text)
            text = re.sub('<h[1-6]>', '<p>', text)
            text = re.sub('</h[1-6]>', '</p>', text)
            text = text.replace('</p>', '</p>\n').replace('<br>', '<br/>')
            return text

        # TODO handle blank-line issues
        str1, str2 = self.clean_blank(str1), self.clean_blank(str2)

        # TODO html_content handling: 1. remove redundant/unusable tags and tags
        # that break the display; 2. fix and normalize line breaks
        str2 = new_line(text=str2)

        return str1, str2


if __name__ == '__main__':
    with open('html_content.html', 'r', encoding='utf-8') as f:
        html = f.read()

    ca = CleanArticle(text=html)
    _, html_content = ca.run()
    print(html_content)
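Beyond the built-in defaults, callers can pass their own removal rules through xpath_dict and pq_dict, which the class merges with its mandatory ones. The following usage sketch is illustrative only: the advert and related-reading selectors, and the example url, are assumptions rather than part of the original article.

# Hypothetical extra rules; adjust the selectors to the real page.
custom_xpath = {'remove_ads': '//div[@class="advert"]'}
custom_pq = {'remove_related': 'div.related-reading'}

with open('html_content.html', 'r', encoding='utf-8') as f:
    html = f.read()

ca = CleanArticle(
    text=html,
    url='https://example.com/news/1.html',  # base url for completing relative img links
    xpath_dict=custom_xpath,
    pq_dict=custom_pq
)
content, html_content = ca.run()
print(content)

Because xpath_clean and pyquery_clean only read the values of these dicts, the keys are just labels; choose names that describe what each rule removes.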