Cách chuyển trang web thành PDF bằng Python

Question 1

Tôi đang tìm giải pháp để in trang web thành tệp PDF cục bộ bằng Python. Một trong những giải pháp tốt là sử dụng Qt, được tìm thấy tại đây: https://bharatikunal.wordpress.com/2010/01/ .

Lúc đầu nó không hoạt động vì tôi gặp sự cố khi cài đặt PyQt4: nó đưa ra thông báo lỗi như 'ImportError: No module named PyQt4.QtCore'.

Đó là do PyQt4 không được cài đặt đúng cách. Các thư viện của tôi thường nằm ở C:\Python27\Lib, nhưng PyQt4 thì không nằm ở đó.

Trên thực tế, chỉ cần tải xuống từ http://www.riverbankcomputing.com/software/pyqt/download (nhớ chọn đúng phiên bản Python mà bạn đang sử dụng) và cài đặt nó vào C:\Python27 (trường hợp của tôi). Chỉ vậy thôi.

Bây giờ các tập lệnh chạy tốt nên tôi muốn chia sẻ nó. Để có thêm tùy chọn khi sử dụng QPrinter, vui lòng tham khảo http://qt-project.org/doc/qt-4.8/qprinter.html#Orientation-enum .

Question 2

Bạn cũng có thể sử dụng pdfkit :

Sử dụng

import pdfkit
# Render the page at the given URL and save it as a PDF in the current
# working directory.
source_url = 'http://google.com'
target_pdf = 'out.pdf'
pdfkit.from_url(source_url, target_pdf)

Tải về

Hệ điều hành Mac: brew install Caskroom/cask/wkhtmltopdf

Debian / Ubuntu: apt-get install wkhtmltopdf

Windows: choco install wkhtmltopdf

Xem tài liệu chính thức cho MacOS / Ubuntu / hệ điều hành khác: https://github.com/JazzCore/python-pdfkit/wiki/Installing-wkhtmltopdf

Question 3

WeasyPrint

pip install weasyprint  # No longer supports Python 2.x.

python
>>> import weasyprint
>>> pdf = weasyprint.HTML('http://www.google.com').write_pdf()
>>> len(pdf)
92059
>>> open('google.pdf', 'wb').write(pdf)

Question 4

Nhờ các bài đăng dưới đây, tôi đã có thể thêm địa chỉ liên kết của trang web và thời gian hiện tại vào tệp PDF được tạo, bất kể nó có bao nhiêu trang.

Thêm văn bản vào PDF hiện có bằng Python

https://github.com/disflux/django-mtr/blob/master/pdfgen/doc_overlay.py

Để chia sẻ script như bên dưới:

import time
from pyPdf import PdfFileWriter, PdfFileReader
import StringIO
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from xhtml2pdf import pisa
import sys 
from PyQt4.QtCore import *
from PyQt4.QtGui import * 
from PyQt4.QtWebKit import * 

# Page to render and the PDF files this script works with.
url = 'http://www.yahoo.com'
tem_pdf = "c:\\tem_pdf.pdf"       # intermediate PDF written by the Qt printer
final_file = "c:\\younameit.pdf"  # name of the final PDF; not written here

# A QApplication must be created before the web view and the printer.
app = QApplication(sys.argv)
web = QWebView()
# Start loading the page; completion is reported through the
# loadFinished(bool) signal connected further down.
web.load(QUrl(url))

# Configure a printer that writes an A4 landscape PDF to tem_pdf.
printer = QPrinter()
printer.setPageSize(QPrinter.A4)
printer.setOrientation(QPrinter.Landscape)
printer.setOutputFormat(QPrinter.PdfFormat)
printer.setOutputFileName(tem_pdf)


def convertIt():
    """Print the loaded page to the configured PDF and stop the event loop."""
    web.print_(printer)
    QApplication.exit()


QObject.connect(web, SIGNAL("loadFinished(bool)"), convertIt)

# Runs the Qt event loop until convertIt() calls QApplication.exit().
app.exec_()
# NOTE: the original had a bare `sys.exit` expression here. It never called
# the function, so it did nothing; it has been removed. Actually calling
# sys.exit() at this point would stop the script before any code that follows.

# Below is to add on the weblink as text and present date&time on PDF generated

# Build a one-page overlay PDF in memory with ReportLab.
packet = StringIO.StringIO()
can = canvas.Canvas(packet, pagesize=letter)
can.setFont("Helvetica", 9)
oknow = time.strftime("%a, %d %b %Y %H:%M")
# Both strings are drawn 2 units above the bottom edge: the URL near the
# left, the timestamp at x=605.
can.drawString(5, 2, url)
can.drawString(605, 2, oknow)
can.save()

# Rewind the buffer so the overlay can be read back as a PDF.
packet.seek(0)
new_pdf = PdfFileReader(packet)
overlay_page = new_pdf.getPage(0)

output = PdfFileWriter()
# Context managers close both files even if merging or writing fails
# (the original leaked the input handle and relied on Python 2's file()).
with open(tem_pdf, "rb") as existing_stream:
    existing_pdf = PdfFileReader(existing_stream)
    pages = existing_pdf.getNumPages()
    # Merge the same overlay page onto every page of the existing PDF.
    for x in range(pages):
        page = existing_pdf.getPage(x)
        page.mergePage(overlay_page)
        output.addPage(page)
    # The input stays open while writing -- presumably the reader may still
    # need to pull page data from it at write time; verify against pyPdf.
    with open(final_file, "wb") as outputStream:
        output.write(outputStream)

# Produces the same text under Python 2 and Python 3.
print("%s is ready." % final_file)

Question 5

Đây là một giải pháp hoạt động tốt:

import sys 
from PyQt4.QtCore import *
from PyQt4.QtGui import * 
from PyQt4.QtWebKit import * 

# The application object has to exist before the view and the printer.
app = QApplication(sys.argv)

# Web view that fetches the page to export.
view = QWebView()
page_url = QUrl("http://www.yahoo.com")
view.load(page_url)

# Printer set up to emit an A4 PDF named fileOK.pdf.
pdf_printer = QPrinter()
pdf_printer.setPageSize(QPrinter.A4)
pdf_printer.setOutputFormat(QPrinter.PdfFormat)
pdf_printer.setOutputFileName("fileOK.pdf")


def convertIt():
    """Write the loaded page to the PDF printer, report it, and quit Qt."""
    view.print_(pdf_printer)
    print("Pdf generated")
    QApplication.exit()


# Run convertIt once the view signals that loading has finished.
QObject.connect(view, SIGNAL("loadFinished(bool)"), convertIt)

# Hand the event loop's return value to the interpreter as the exit status.
exit_status = app.exec_()
sys.exit(exit_status)

Question 6

Đây là một giải pháp đơn giản bằng cách sử dụng QT. Tôi thấy đây là một phần của câu trả lời cho một câu hỏi khác trên StackOverFlow. Tôi đã thử nghiệm nó trên Windows.

from PyQt4.QtGui import QTextDocument, QPrinter, QApplication

import sys
# A QApplication must exist before QTextDocument and QPrinter are used.
app = QApplication(sys.argv)

doc = QTextDocument()
location = "c://apython//Jim//html//notes.html"
# Read through a context manager so the file handle is closed right away
# (the original left it open).
with open(location) as html_file:
    html = html_file.read()
doc.setHtml(html)

# Printer writing an A4 PDF named foo.pdf with 15 mm margins on every side.
printer = QPrinter()
printer.setOutputFileName("foo.pdf")
printer.setOutputFormat(QPrinter.PdfFormat)
printer.setPageSize(QPrinter.A4)
printer.setPageMargins(15, 15, 15, 15, QPrinter.Millimeter)

doc.print_(printer)
# Parenthesised single-argument form prints the same under Python 2 and 3.
print("done!")

Question 7

Tôi đã thử câu trả lời @NorthCat bằng pdfkit.

Nó yêu cầu wkhtmltopdf được cài đặt. Bản cài đặt có thể được tải xuống từ đây. https://wkhtmltopdf.org/downloads.html

Cài đặt tệp thực thi. Sau đó, viết một dòng để cho biết wkhtmltopdf nằm ở đâu, như bên dưới (tham khảo từ câu hỏi «Không thể tạo pdf bằng python PDFKIT, lỗi: "No wkhtmltopdf executable found:"»).

import pdfkit


# Location of the wkhtmltopdf executable on this machine.
wkhtmltopdf_exe = "C:\\Folder\\where\\wkhtmltopdf.exe"
# Configuration object telling pdfkit which executable to use.
pdfkit_config = pdfkit.configuration(wkhtmltopdf=wkhtmltopdf_exe)

# Convert the page at the URL into out.pdf using that configuration.
source_url = "http://google.com"
target_pdf = "out.pdf"
pdfkit.from_url(source_url, target_pdf, configuration=pdfkit_config)

Question 8

Giải pháp này phù hợp với tôi khi sử dụng PyQt5 phiên bản 5.15.0

import sys
from PyQt5 import QtWidgets, QtWebEngineWidgets
from PyQt5.QtCore import QUrl
from PyQt5.QtGui import QPageLayout, QPageSize
from PyQt5.QtWidgets import QApplication

if __name__ == '__main__':
    # The application object must exist before the web view is created.
    app = QtWidgets.QApplication(sys.argv)
    loader = QtWebEngineWidgets.QWebEngineView()
    loader.setZoomFactor(1)

    # Page layout handed to printToPdf: A4Extra, portrait.
    layout = QPageLayout()
    layout.setPageSize(QPageSize(QPageSize.A4Extra))
    layout.setOrientation(QPageLayout.Portrait)

    # The URL must be absolute. The original literal was a site-relative
    # path ('/programming/...'), which has no scheme or host to load from.
    loader.load(QUrl('https://stackoverflow.com/questions/23359083/how-to-convert-webpage-into-pdf-by-using-python'))
    # Quit the event loop as soon as the PDF has been written.
    loader.page().pdfPrintingFinished.connect(lambda *args: QApplication.exit())

    def emit_pdf(finished):
        """Print the page to test.pdf once loading ends.

        ``finished`` is the boolean carried by the loadFinished signal; a
        false value is reported on stderr, and the page is still printed
        as before.
        """
        if not finished:
            sys.stderr.write("warning: page did not load successfully\n")
        loader.page().printToPdf("test.pdf", pageLayout=layout)

    loader.loadFinished.connect(emit_pdf)
    sys.exit(app.exec_())

Question 9

Nếu bạn sử dụng Selenium và Chromium, bạn không cần phải tự quản lý cookie và bạn có thể tạo tệp PDF bằng chức năng "in thành PDF" của Chromium. Bạn có thể tham khảo dự án này để thực hiện điều đó: https://github.com/maxvst/python-selenium-chrome-html-to-pdf-converter

Mã dưới đây được sửa đổi dựa trên: https://github.com/maxvst/python-selenium-chrome-html-to-pdf-converter/blob/master/sample/html_to_pdf_converter.py

import sys
import json, base64


def send_devtools(driver, cmd, params=None):
    """Send a DevTools command through the driver and return its result.

    Args:
        driver: object exposing ``session_id`` and a ``command_executor``
            that has a ``_url`` attribute and a ``_request(method, url,
            body)`` method.
        cmd: name of the DevTools command, e.g. ``"Page.printToPDF"``.
        params: mapping of command parameters; ``None`` (the default)
            sends an empty mapping.

    Returns:
        The ``'value'`` entry of the response, or ``None`` if it is absent.
    """
    # Use None as the default instead of a shared mutable ``{}``.
    if params is None:
        params = {}
    resource = "/session/%s/chromium/send_command_and_get_result" % driver.session_id
    url = driver.command_executor._url + resource
    body = json.dumps({'cmd': cmd, 'params': params})
    response = driver.command_executor._request('POST', url, body)
    return response.get('value')


def get_pdf_from_html(driver, url, print_options=None, output_file_path="example.pdf"):
    """Open ``url`` in the driver and save the page as a PDF file.

    Args:
        driver: browser driver; must provide ``get(url)`` and whatever
            ``send_devtools`` needs from it.
        url: address of the page to load.
        print_options: mapping of ``Page.printToPDF`` options that override
            the defaults below; ``None`` (the default) keeps the defaults.
        output_file_path: path of the PDF file to write.
    """
    driver.get(url)

    calculated_print_options = {
        'landscape': False,
        'displayHeaderFooter': False,
        'printBackground': True,
        'preferCSSPageSize': True,
    }
    # Use None as the default instead of a shared mutable ``{}``.
    if print_options:
        calculated_print_options.update(print_options)
    result = send_devtools(driver, "Page.printToPDF", calculated_print_options)
    # The command result carries the PDF as base64 text under 'data'.
    data = base64.b64decode(result['data'])
    with open(output_file_path, "wb") as f:
        f.write(data)



# example
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Page to convert. The URL must be absolute; the original literal was a
# site-relative path ("/programming/..."), which has no scheme or host.
url = "https://stackoverflow.com/questions/23359083/how-to-convert-webpage-into-pdf-by-using-python"
# Path to the chromedriver executable. The original used this name without
# defining it (NameError). The bare name presumably resolves via PATH;
# replace it with a full path if it does not.
chromedriver = "chromedriver"

# Chrome command-line switches for the session.
webdriver_options = Options()
webdriver_options.add_argument("--no-sandbox")
webdriver_options.add_argument('--headless')
webdriver_options.add_argument('--disable-gpu')

driver = webdriver.Chrome(chromedriver, options=webdriver_options)
try:
    get_pdf_from_html(driver, url)
finally:
    # Always shut the browser down, even when the conversion raises.
    driver.quit()