3.2转换pdf时注意事项

提取正文主体后，可以直接将
保存html，不需要补全
图片链接相对路径需要转换为绝对路径，才会自动加载图片
pdfkit.from_file 第一个参数 input 为 html文件路径列表，文件名不能是中文。。。
- pdfkit.from_file(self.htmls_saved, self.netloc+‘.pdf‘, options=options)
pdf会根据

等标题 tag 自动生成目录
4.实践代码

#!usr/bin/env python
#coding:utf-8

import os
import sys
import traceback
import re
import urlparse
import threading
import Queue

import requests
from scrapy import Selector
import pdfkit


s = requests.Session()
# s.headers.update({‘user-agent‘:‘Mozilla/5.0 (iPhone; CPU iPhone OS 9_3_5 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Mobile/13G36 MicroMessenger/6.5.12 NetType/4G‘})
s.headers.update({‘user-agent‘:‘Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36‘})
# s.headers.update({‘Referer‘:‘https://servicewechat.com/wx55b926152a8c3bef/14/page-frame.html‘})
s.verify = False
s.mount(‘https://‘, requests.adapters.HTTPAdapter(pool_connections=1000, pool_maxsize=1000)) 
import copy
sp = copy.deepcopy(s)
proxies = {‘http‘: ‘http://127.0.0.1:1080‘, ‘https‘: ‘https://127.0.0.1:1080‘}
sp.proxies = proxies 

from urllib3.exceptions import InsecureRequestWarning
from warnings import filterwarnings
filterwarnings(‘ignore‘, category = InsecureRequestWarning)

html_template = u"""


        {content}
    

"""

# https://wkhtmltopdf.org/usage/wkhtmltopdf.txt
options = {
    ‘page-size‘: ‘A4‘,  # Letter
    ‘minimum-font-size‘: 25,  ###
    # ‘image-dpi‘:1500, ###
    
    ‘margin-top‘: ‘0.1in‘,  #0.75in
    ‘margin-right‘: ‘0.1in‘,
    ‘margin-bottom‘: ‘0.1in‘,
    ‘margin-left‘: ‘0.1in‘,
    ‘encoding‘: ‘UTF-8‘,  #支持中文
    ‘custom-header‘: [
        (‘Accept-Encoding‘, ‘gzip‘)
    ],
    ‘cookie‘: [
        (‘cookie-name1‘, ‘cookie-value1‘),
        (‘cookie-name2‘, ‘cookie-value2‘),
    ],
    ‘outline-depth‘: 10,
}


class HTMLtoPDF(object):

    def __init__(self, seed_url, font_size=25, css_links=‘div[class="wy-menu wy-menu-vertical"] a::attr(href)‘,
                css_content=‘div.rst-content‘, threads_count=30):
        self.seed_url = seed_url
        options[‘minimum-font-size‘] = font_size
        
        self.netloc = urlparse.urlparse(seed_url).netloc 
        print self.netloc
        self.folder = os.path.join(sys.path[0], self.netloc)
        self.folder_temp = os.path.join(sys.path[0], ‘temp‘)
        for f in [self.folder, self.folder_temp]:
            if not os.path.isdir(f):
                os.mkdir(f)
        
        self.css_content = css_content
        self.css_links = css_links
        
        self.threads_count = threads_count
        # self.lock = threading.Lock()
        self.links_queue = Queue.Queue()     
        
        self.links_queue.put((‘0‘, self.seed_url))
        self.get_links()
        self.htmls_saved = [] 
        
        # 验证 re.findall(pattern, s)
        # 
        self.img_scr_pattern = re.compile(r‘(]*?src\s*=\s*")(?P.*?)(".*?>)‘)  #最后不能简写成 " ，否则结果缺 "

        
        # issue 1008
        # text为空，也能匹配到 m.group(4)=‘‘
        self.a_href_pattern = re.compile(r‘(]*?href\s*=\s*")(?P.*?)(".*?>)(?P.*?)()‘)
        
        # http://www.seleniumhq.org/docs/ 合体。。。text为 
        # 

        
    def get_links(self):
        text = self.load_page(self.seed_url)
        sel = Selector(text=text)
        
        # [u‘#selenium-documentation‘,
        # u‘00_Note_to-the-reader.jsp‘,
        # u‘01_introducing_selenium.jsp‘,
        # u‘01_introducing_selenium.jsp#test-automation-for-web-applications‘,    

        # links = [re.sub(r‘#.*$‘,‘‘, i) for i in sel.css(‘div[class="toctree-wrapper compound"] a::attr(href)‘).extract()]        
        links = [re.sub(r‘#.*$‘,‘‘, i) for i in sel.css(self.css_links).extract()]        
        links_seen = [self.seed_url]
        
        for link in links:  #set(links) 会导致乱序,使用urls_seen 去重
            link_abs = urlparse.urljoin(self.seed_url, link)
            if link_abs not in links_seen:
                self.links_queue.put((str(len(links_seen)), link_abs))
                links_seen.append(link_abs)

    def save_html(self):
        while True:
            try:
                (num, url) = self.links_queue.get()
                text = self.load_page(url)
                
                title, content = self.parse_page(url, text)
                
                filename_cn = u‘{}_{}.html‘.format(num, re.sub(ur‘[^\u4e00-\u9fa5\w\s()_-]‘, ‘‘, title))  #ur    
                filename = u‘{}_{}.html‘.format(num, re.sub(r‘[^\w\s()_-]‘, ‘‘, title))  #os.path.abspath(‘en/abc.html‘)合成路径 不能是 /en。。
                
                with open(os.path.join(self.folder, filename_cn),‘wb‘) as fp:
                    fp.write(text.encode(‘utf-8‘,‘replace‘))
                f = os.path.join(self.folder_temp, filename)
                with open(f,‘wb‘) as fp:
                    fp.write(content.encode(‘utf-8‘,‘replace‘)) 
                    # fp.write(html_template.format(content=content, title=title).encode(‘utf-8‘,‘replace‘))
                    self.htmls_saved.append(f)
                    print ‘{}/{}‘.format(len(self.htmls_saved), self.links_queue.qsize())
                    
                self.links_queue.task_done()
            except Exception as err:
                print ‘{} {} {}‘.format(url, err, traceback.format_exc())
                
    def run(self):
        threads = []
        for i in range(self.threads_count):
            t = threading.Thread(target=self.save_html)
            threads.append(t)

        for t in threads:
            t.setDaemon(True) 
            t.start() 
            
        self.links_queue.join()
        print ‘load done‘
        
        def func(filename):
            _, filename =os.path.split(filename)
            return int(filename[:filename.index(‘_‘)])
        
        self.htmls_saved.sort(key=lambda x:func(x))
        pdfkit.from_file(self.htmls_saved, self.netloc+‘.pdf‘, options=options)
        print self.netloc, ‘pdf done‘
        

    def load_page(self, url):
            
        resp = sp.get(url)  ###############

        if resp.encoding == ‘ISO-8859-1‘:
            encodings = requests.utils.get_encodings_from_content(resp.content)  #re.compile(r‘if encodings:
                resp.encoding = encodings[0]
            else:
                resp.encoding = resp.apparent_encoding  #models.py  chardet.detect(self.content)[‘encoding‘]
            # print ‘ISO-8859-1 changed to %s‘%resp.encoding
            
        return resp.text   

    def parse_page(self, url, text):
        sel = Selector(text=text)

        title = sel.css(‘head title::text‘).extract_first() or ‘‘  #固定css
        content = sel.css(self.css_content).extract_first() or ‘‘  #‘div.rst-content‘
        
        # sel = sel.css("div#rst-content")[0]  ###缩小范围
        content = self.clean_content(content)
        content = self.modify_content(url, content)  
        
        return title, content
  
    def clean_content(self, content):
        sel = Selector(text=content)
        # content = content.replace(sel.css(‘div#codeLanguagePreference‘).extract_first(), ‘‘) #可能是None
        for div in sel.css(‘div#codeLanguagePreference‘).extract():
            content = content.replace(div, ‘‘)
        
        for lang in [‘java‘, ‘csharp‘, ‘ruby‘, ‘php‘, ‘perl‘, ‘javascript‘]:
            for div in sel.css(‘div.highlight-%s‘%lang).extract():
                # print len(content)
                content = content.replace(div, ‘‘)
                
        return content
        
    def modify_content(self, url, content):
        # m.group(1)=‘abc‘ SyntaxError: can‘t assign to function call 不能直接赋值
        
        # https://doc.scrapy.org/en/latest/topics/firebug.html
        # ../_images/firebug1.png
        # 异常 urlparse.urljoin(self.seed_url, src)
        
        # r‘(]*?src\s*=\s*")(?P.*?)(".*?>)‘
        def func_src(m):
            src = m.group(‘src‘)  #别名
            if not src.startswith(‘http‘):
                src = urlparse.urljoin(url, src)
            return u‘{}{}{}‘.format(m.group(1), src, m.group(3))

        content = re.sub(self.img_scr_pattern, func_src, content)

        
        # re.compile(r‘(]*?href\s*=\s*")(?P.*?)(".*?>)(?P.*?)()‘)
        def func_href(m):
            href = m.group(‘href‘)
            text = m.group(‘text‘)
            if not href.startswith(‘#‘):
                if not href.startswith(‘http‘):
                    href = urlparse.urljoin(url, href)
                text = u‘{text} ({href})‘.format(text=text, href=href)
            return u‘{g1}{href}{g3}{text}{g5}‘.format(g1=m.group(1), g3=m.group(3), g5=m.group(5), href=href, text=text)
            #m.string是content全文。。。也不能 return m

        content = re.sub(self.a_href_pattern, func_href, content)   

        return content
        
    
    def modify_content2(self, url, content):
        sel = Selector(text=content)

        # 修改图片链接为绝对链接，否则pdf无法图片        
        # 
        for i in sel.css(‘img[src]‘):
            tag = i.extract()
            src = i.xpath(‘./@src‘).extract_first()
            if not src.startswith(‘http‘):
                src_abs = urlparse.urljoin(url, src)
                # print src, src_abs
                tag_new = tag.replace(src, src_abs)     
                content = content.replace(tag, tag_new)  #可能alt(同src...)
                
        # a href 的text添加href信息
        # issue 1008
        for i in sel.css(‘a[href]‘):
            tag = i.extract()
            href = i.xpath(‘./@href‘).extract_first()
            text = i.xpath(‘./text()‘).extract_first()
            
            # 补全内部链接，忽略本页面的#定位
            if not href.startswith(‘http‘) and not href.startswith(‘#‘):
                href_abs = urlparse.urljoin(url, href)
                # print href, href_abs
                tag_new = tag.replace(href, href_abs)
            else:
                href_abs = href
                tag_new = tag
                
            # 图标链接，如果text为None，replace表现异常
            if text and not href.startswith(‘#‘):
                text_new = u‘{} ({})‘.format(text, href_abs)
                # print text.encode(‘gbk‘,‘replace‘), text_new.encode(‘gbk‘,‘replace‘)
                tag_new = tag_new.replace(text, text_new)          
            
            # 保证整体替换   
            content = content.replace(tag, tag_new)  
        
        return content



if __name__ == ‘__main__‘:
    url = ‘https://doc.scrapy.org/en/latest/index.html‘
    # obj = HTMLtoPDF(url)  
    
    url = ‘http://python3-cookbook.readthedocs.io/zh_CN/latest/index.html‘
    # obj = HTMLtoPDF(url, font_size=20, css_links=‘div[class="toctree-wrapper compound"] a::attr(href)‘) 
    
    url = ‘http://www.seleniumhq.org/docs/‘
    obj = HTMLtoPDF(url, css_links=‘div#selenium-documentation a::attr(href)‘, css_content=‘div#mainContent‘)  
    
    obj.run()
html2doc
标签：error: rom headers 就是 oct mob 定位 _id agent
原文地址：http://www.cnblogs.com/my8100/p/7738366.html
html2doc

1.参考

2.安装

3.背景知识

3.1url 相对路径绝对路径

3.2页面布局规律

3.2转换pdf时注意事项

等标题 tag 自动生成目录

4.实践代码

评论

热门文章

推荐文章

最新文章

置顶文章

html2doc

1.参考

2.安装

3.背景知识

3.1url 相对路径 绝对路径

3.2页面布局规律

3.2转换pdf时注意事项

等标题 tag 自动生成目录

4.实践代码

评论

热门文章

推荐文章

最新文章

置顶文章

3.1url 相对路径绝对路径