多进程#
Multiprocessing
PyMuPDF 不支持在多个线程上运行——这样做可能会导致不正确的行为,甚至使 Python 崩溃。
但是,可以通过使用 Python 的 multiprocessing 模块来以多种方式进行处理。
如果你希望加速大文档的按页处理,可以使用以下脚本作为起点。它的处理速度至少是相应顺序处理的两倍。
See code
"""
Demonstrate the use of multiprocessing with PyMuPDF.
Depending on the number of CPUs, the document is divided in page ranges.
Each range is then worked on by one process.
The type of work would typically be text extraction or page rendering. Each
process must know where to put its results, because this processing pattern
does not include inter-process communication or data sharing.
Compared to sequential processing, speed improvements in range of 100% (ie.
twice as fast) or better can be expected.
"""
from __future__ import print_function, division
import sys
import os
import time
from multiprocessing import Pool, cpu_count
import pymupdf
# choose a version specific timer function (bytes == str in Python 2)
mytime = time.clock if str is bytes else time.perf_counter
def render_page(vector):
"""Render a page range of a document.
Notes:
The PyMuPDF document cannot be part of the argument, because that
cannot be pickled. So we are being passed in just its filename.
This is no performance issue, because we are a separate process and
need to open the document anyway.
Any page-specific function can be processed here - rendering is just
an example - text extraction might be another.
The work must however be self-contained: no inter-process communication
or synchronization is possible with this design.
Care must also be taken with which parameters are contained in the
argument, because it will be passed in via pickling by the Pool class.
So any large objects will increase the overall duration.
Args:
vector: a list containing required parameters.
"""
# recreate the arguments
idx = vector[0] # this is the segment number we have to process
cpu = vector[1] # number of CPUs
filename = vector[2] # document filename
mat = vector[3] # the matrix for rendering
doc = pymupdf.open(filename) # open the document
num_pages = doc.page_count # get number of pages
# pages per segment: make sure that cpu * seg_size >= num_pages!
seg_size = int(num_pages / cpu + 1)
seg_from = idx * seg_size # our first page number
seg_to = min(seg_from + seg_size, num_pages) # last page number
for i in range(seg_from, seg_to): # work through our page segment
page = doc[i]
# page.get_text("rawdict") # use any page-related type of work here, eg
pix = page.get_pixmap(alpha=False, matrix=mat)
# store away the result somewhere ...
# pix.save("p-%i.png" % i)
print("Processed page numbers %i through %i" % (seg_from, seg_to - 1))
if __name__ == "__main__":
t0 = mytime() # start a timer
filename = sys.argv[1]
mat = pymupdf.Matrix(0.2, 0.2) # the rendering matrix: scale down to 20%
cpu = cpu_count()
# make vectors of arguments for the processes
vectors = [(i, cpu, filename, mat) for i in range(cpu)]
print("Starting %i processes for '%s'." % (cpu, filename))
pool = Pool() # make pool of 'cpu_count()' processes
pool.map(render_page, vectors, 1) # start processes passing each a vector
t1 = mytime() # stop the timer
print("Total time %g seconds" % round(t1 - t0, 2))
这是一个更复杂的示例,涉及主进程(显示 GUI)与子进程(执行 PyMuPDF 文档访问)之间的进程间通信。
PyMuPDF does not support running on multiple threads - doing so may cause incorrect behaviour or even crash Python itself.
However, there is the option to use Python’s multiprocessing module in a variety of ways.
If you are looking to speed up page-oriented processing for a large document, use this script as a starting point. It should be at least twice as fast as the corresponding sequential processing.
See code
"""
Demonstrate the use of multiprocessing with PyMuPDF.
Depending on the number of CPUs, the document is divided in page ranges.
Each range is then worked on by one process.
The type of work would typically be text extraction or page rendering. Each
process must know where to put its results, because this processing pattern
does not include inter-process communication or data sharing.
Compared to sequential processing, speed improvements in range of 100% (ie.
twice as fast) or better can be expected.
"""
from __future__ import print_function, division
import sys
import os
import time
from multiprocessing import Pool, cpu_count
import pymupdf
# choose a version specific timer function (bytes == str in Python 2)
mytime = time.clock if str is bytes else time.perf_counter
def render_page(vector):
"""Render a page range of a document.
Notes:
The PyMuPDF document cannot be part of the argument, because that
cannot be pickled. So we are being passed in just its filename.
This is no performance issue, because we are a separate process and
need to open the document anyway.
Any page-specific function can be processed here - rendering is just
an example - text extraction might be another.
The work must however be self-contained: no inter-process communication
or synchronization is possible with this design.
Care must also be taken with which parameters are contained in the
argument, because it will be passed in via pickling by the Pool class.
So any large objects will increase the overall duration.
Args:
vector: a list containing required parameters.
"""
# recreate the arguments
idx = vector[0] # this is the segment number we have to process
cpu = vector[1] # number of CPUs
filename = vector[2] # document filename
mat = vector[3] # the matrix for rendering
doc = pymupdf.open(filename) # open the document
num_pages = doc.page_count # get number of pages
# pages per segment: make sure that cpu * seg_size >= num_pages!
seg_size = int(num_pages / cpu + 1)
seg_from = idx * seg_size # our first page number
seg_to = min(seg_from + seg_size, num_pages) # last page number
for i in range(seg_from, seg_to): # work through our page segment
page = doc[i]
# page.get_text("rawdict") # use any page-related type of work here, eg
pix = page.get_pixmap(alpha=False, matrix=mat)
# store away the result somewhere ...
# pix.save("p-%i.png" % i)
print("Processed page numbers %i through %i" % (seg_from, seg_to - 1))
if __name__ == "__main__":
t0 = mytime() # start a timer
filename = sys.argv[1]
mat = pymupdf.Matrix(0.2, 0.2) # the rendering matrix: scale down to 20%
cpu = cpu_count()
# make vectors of arguments for the processes
vectors = [(i, cpu, filename, mat) for i in range(cpu)]
print("Starting %i processes for '%s'." % (cpu, filename))
pool = Pool() # make pool of 'cpu_count()' processes
pool.map(render_page, vectors, 1) # start processes passing each a vector
t1 = mytime() # stop the timer
print("Total time %g seconds" % round(t1 - t0, 2))
Here is a more complex example involving inter-process communication between a main process (showing a GUI) and a child process doing PyMuPDF access to a document.
See code
"""
Created on 2019-05-01
@author: yinkaisheng@live.com
@copyright: 2019 yinkaisheng@live.com
@license: GNU AFFERO GPL 3.0
Demonstrate the use of multiprocessing with PyMuPDF
-----------------------------------------------------
This example shows some more advanced use of multiprocessing.
The main process show a Qt GUI and establishes a 2-way communication with
another process, which accesses a supported document.
"""
import os
import sys
import time
import multiprocessing as mp
import queue
import pymupdf
''' PyQt and PySide namespace unifier shim
https://www.pythonguis.com/faq/pyqt6-vs-pyside6/
simple "if 'PyQt6' in sys.modules:" test fails for me, so the more complex pkgutil use
overkill for most people who might have one or the other, why both?
'''
from pkgutil import iter_modules
def module_exists(module_name):
return module_name in (name for loader, name, ispkg in iter_modules())
if module_exists("PyQt6"):
# PyQt6
from PyQt6 import QtGui, QtWidgets, QtCore
from PyQt6.QtCore import pyqtSignal as Signal, pyqtSlot as Slot
wrapper = "PyQt6"
elif module_exists("PySide6"):
# PySide6
from PySide6 import QtGui, QtWidgets, QtCore
from PySide6.QtCore import Signal, Slot
wrapper = "PySide6"
my_timer = time.clock if str is bytes else time.perf_counter
class DocForm(QtWidgets.QWidget):
def __init__(self):
super().__init__()
self.process = None
self.queNum = mp.Queue()
self.queDoc = mp.Queue()
self.page_count = 0
self.curPageNum = 0
self.lastDir = ""
self.timerSend = QtCore.QTimer(self)
self.timerSend.timeout.connect(self.onTimerSendPageNum)
self.timerGet = QtCore.QTimer(self)
self.timerGet.timeout.connect(self.onTimerGetPage)
self.timerWaiting = QtCore.QTimer(self)
self.timerWaiting.timeout.connect(self.onTimerWaiting)
self.initUI()
def initUI(self):
vbox = QtWidgets.QVBoxLayout()
self.setLayout(vbox)
hbox = QtWidgets.QHBoxLayout()
self.btnOpen = QtWidgets.QPushButton("OpenDocument", self)
self.btnOpen.clicked.connect(self.openDoc)
hbox.addWidget(self.btnOpen)
self.btnPlay = QtWidgets.QPushButton("PlayDocument", self)
self.btnPlay.clicked.connect(self.playDoc)
hbox.addWidget(self.btnPlay)
self.btnStop = QtWidgets.QPushButton("Stop", self)
self.btnStop.clicked.connect(self.stopPlay)
hbox.addWidget(self.btnStop)
self.label = QtWidgets.QLabel("0/0", self)
self.label.setFont(QtGui.QFont("Verdana", 20))
hbox.addWidget(self.label)
vbox.addLayout(hbox)
self.labelImg = QtWidgets.QLabel("Document", self)
sizePolicy = QtWidgets.QSizePolicy(
QtWidgets.QSizePolicy.Policy.Preferred, QtWidgets.QSizePolicy.Policy.Expanding
)
self.labelImg.setSizePolicy(sizePolicy)
vbox.addWidget(self.labelImg)
self.setGeometry(100, 100, 400, 600)
self.setWindowTitle("PyMuPDF Document Player")
self.show()
def openDoc(self):
path, _ = QtWidgets.QFileDialog.getOpenFileName(
self,
"Open Document",
self.lastDir,
"All Supported Files (*.pdf;*.epub;*.xps;*.oxps;*.cbz;*.fb2);;PDF Files (*.pdf);;EPUB Files (*.epub);;XPS Files (*.xps);;OpenXPS Files (*.oxps);;CBZ Files (*.cbz);;FB2 Files (*.fb2)",
#options=QtWidgets.QFileDialog.Options(),
)
if path:
self.lastDir, self.file = os.path.split(path)
if self.process:
self.queNum.put(-1) # use -1 to notify the process to exit
self.timerSend.stop()
self.curPageNum = 0
self.page_count = 0
self.process = mp.Process(
target=openDocInProcess, args=(path, self.queNum, self.queDoc)
)
self.process.start()
self.timerGet.start(40)
self.label.setText("0/0")
self.queNum.put(0)
self.startTime = time.perf_counter()
self.timerWaiting.start(40)
def playDoc(self):
self.timerSend.start(500)
def stopPlay(self):
self.timerSend.stop()
def onTimerSendPageNum(self):
if self.curPageNum < self.page_count - 1:
self.queNum.put(self.curPageNum + 1)
else:
self.timerSend.stop()
def onTimerGetPage(self):
try:
ret = self.queDoc.get(False)
if isinstance(ret, int):
self.timerWaiting.stop()
self.page_count = ret
self.label.setText("{}/{}".format(self.curPageNum + 1, self.page_count))
else: # tuple, pixmap info
num, samples, width, height, stride, alpha = ret
self.curPageNum = num
self.label.setText("{}/{}".format(self.curPageNum + 1, self.page_count))
fmt = (
QtGui.QImage.Format.Format_RGBA8888
if alpha
else QtGui.QImage.Format.Format_RGB888
)
qimg = QtGui.QImage(samples, width, height, stride, fmt)
self.labelImg.setPixmap(QtGui.QPixmap.fromImage(qimg))
except queue.Empty as ex:
pass
def onTimerWaiting(self):
self.labelImg.setText(
'Loading "{}", {:.2f}s'.format(
self.file, time.perf_counter() - self.startTime
)
)
def closeEvent(self, event):
self.queNum.put(-1)
event.accept()
def openDocInProcess(path, queNum, quePageInfo):
start = my_timer()
doc = pymupdf.open(path)
end = my_timer()
quePageInfo.put(doc.page_count)
while True:
num = queNum.get()
if num < 0:
break
page = doc.load_page(num)
pix = page.get_pixmap()
quePageInfo.put(
(num, pix.samples, pix.width, pix.height, pix.stride, pix.alpha)
)
doc.close()
print("process exit")
if __name__ == "__main__":
app = QtWidgets.QApplication(sys.argv)
form = DocForm()
sys.exit(app.exec())