depdf package¶
Subpackages¶
Submodules¶
depdf.api module¶
-
depdf.api.
api_load_pdf
(api_func)¶
-
depdf.api.
convert_page_to_html
(pdf, pid, **kwargs)¶ Parameters: - pdf – pdf file path
- pid – page number, starting from 1
- kwargs – config keyword arguments
Returns: page html string
-
depdf.api.
convert_pdf_to_html
(pdf, **kwargs)¶ Parameters: - pdf – pdf file path
- kwargs – config keyword arguments
Returns: pdf html string
-
depdf.api.
extract_page_paragraphs
(pdf, pid, **kwargs)¶ Parameters: - pdf – pdf file path
- pid – page number, starting from 1
- kwargs – config keyword arguments
Returns: page paragraphs list
-
depdf.api.
extract_page_tables
(pdf, pid, **kwargs)¶ Parameters: - pdf – pdf file path
- pid – page number, starting from 1
- kwargs – config keyword arguments
Returns: page tables list
depdf.base module¶
-
class
depdf.base.
Base
¶ Bases:
object
-
html
¶
-
refresh
()¶
-
reset
()¶
-
soup
¶
-
to_dict
¶
-
to_soup
(parser)¶
-
write_to
(file_name)¶
-
-
class
depdf.base.
Box
¶ Bases:
object
-
bbox
¶
-
bottom
= Decimal('0')¶
-
height
¶
-
static
normalize_bbox
(bbox)¶
-
top
= Decimal('0')¶
-
width
¶
-
x0
= Decimal('0')¶
-
x1
= Decimal('0')¶
-
-
class
depdf.base.
InnerWrapper
¶ Bases:
depdf.base.Base
-
inner_objects
¶
-
to_dict
¶
-
depdf.config module¶
-
class
depdf.config.
Config
(**kwargs)¶ Bases:
object
-
add_horizontal_line_tolerance
= Decimal('0.1')¶
-
add_horizontal_lines_flag
= False¶
-
add_line_flag
= False¶
-
add_vertical_lines_flag
= False¶
-
char_overlap_size
= 3¶
-
char_size_lower
= Decimal('3')¶
-
char_size_upper
= Decimal('30')¶
-
column_region_half_width
= 4¶
-
copy
(**kwargs)¶
-
curved_line_flag
= False¶
-
debug_flag
= False¶
-
default_char_size
= Decimal('12')¶
-
default_head_tail_page_offset_percent
= 0.1¶
-
dotted_line_flag
= True¶
-
image_class
= 'pdf-image'¶
-
image_flag
= True¶
-
image_resolution
= 300¶
-
log_level
= 30¶
-
logo_flag
= True¶
-
main_frame_tolerance
= None¶
-
max_columns
= 3¶
-
max_double_line_tolerance
= 3¶
-
min_column_region_objects
= 1¶
-
min_double_line_tolerance
= Decimal('0.05')¶
-
min_image_size
= 80¶
-
mini_page_class
= 'pdf-mini-page'¶
-
multiple_columns_flag
= True¶
-
page_class
= 'pdf-page'¶
-
page_num_left_fraction
= Decimal('0.44')¶
-
page_num_right_fraction
= Decimal('0.56')¶
-
page_num_top_fraction
= Decimal('0.75')¶
-
paragraph_class
= 'pdf-paragraph'¶
-
paragraph_flag
= True¶
-
pdf_class
= 'pdf-content'¶
-
resolution
= 144¶
-
skip_empty_table
= False¶
-
snap_flag
= False¶
-
span_class
= 'pdf-span'¶
-
table_cell_merge_tolerance
= 5¶
-
table_class
= 'pdf-table'¶
-
table_flag
= True¶
-
temp_dir_prefix
= 'temp_depdf'¶
-
to_dict
¶
-
unique_prefix
= None¶
-
update
(**kwargs)¶
-
verbose_flag
= False¶
-
vertical_double_line_tolerance
= Decimal('2')¶
-
x_tolerance
= None¶
-
y_tolerance
= None¶
-
-
depdf.config.
check_config
(func)¶
-
depdf.config.
check_config_type
(config)¶
depdf.error module¶
-
exception
depdf.error.
BoxValueError
(value)¶ Bases:
ValueError
-
exception
depdf.error.
ConfigTypeError
(value)¶ Bases:
TypeError
-
exception
depdf.error.
PDFTypeError
(value)¶ Bases:
TypeError
-
exception
depdf.error.
PageTypeError
(value)¶ Bases:
TypeError
depdf.page module¶
-
class
depdf.page.
DePage
(page, pid='1', same=None, logo=None, config=None, columns=1, mini=False)¶ Bases:
depdf.base.Base
-
analyze_lines
()¶
-
analyze_main_frame
()¶
-
analyze_page_attributes
()¶
-
analyze_paragraph_border
()¶
-
ave_cs
= 0¶
-
ave_lh
= 3¶
-
border
= (0, 0, 0, 0)¶
-
chars
¶
-
check_if_toc_page
()¶
-
check_multi_column_page
()¶
-
config
¶
-
debug
= False¶
-
extract_images
()¶
-
extract_paragraph
()¶
-
extract_phrases
()¶
-
extract_tables
()¶
-
frame_bottom
= 0¶
-
frame_top
= 0¶
-
h_edges
= []¶
-
height
¶
-
html
¶
-
images
¶
-
images_raw
¶
-
min_cs
= 0¶
-
new_para_end_flag
= None¶
-
new_para_start_flag
= None¶
-
object_key_list
= ['_tables', '_paragraphs', '_images']¶
-
objects
¶
-
orientation
= ''¶
-
page
¶
-
pagination_phrases
= []¶
-
paragraphs
¶
-
phrases
= None¶
-
pid
¶
-
prefix
= UUID('e46d017a-0ce2-4c4a-98e0-2d0903969c58')¶
-
process_mini_page
()¶
-
process_page
()¶
-
refresh
()¶
-
save_html
()¶
-
screenshot
¶
-
set_global
()¶
-
tables
¶
-
tables_raw
¶
-
temp_dir
= 'temp'¶
-
to_html
¶
-
to_screenshot
()¶
-
toc_flag
= False¶
-
v_edges
= []¶
-
verbose
= False¶
-
width
¶
-
x_tolerance
= 3¶
-
y_tolerance
= 3¶
-
-
class
depdf.page.
MiniDePage
(page, pid='1', same=None, logo=None, config=None, columns=1, mini=False)¶ Bases:
depdf.page.DePage
-
save_html
()¶
-
to_html
¶
-
-
depdf.page.
check_page_type
(page)¶
-
depdf.page.
convert_plumber_table
(pdf_page, table, pid='1', tid=1, config=None, min_cs=1)¶
-
depdf.page.
extract_cell_region
(cell_region, bbox, config=None, pid='1', tid=1, cid=1)¶
depdf.page_tools module¶
-
depdf.page_tools.
add_horizontal_lines
(v_lines, h_lines, vlts_tolerance=0.1)¶
-
depdf.page_tools.
add_vertical_lines
(v_lines, h_lines, page_rects, page, ave_cs)¶
-
depdf.page_tools.
analyze_char_size
(chars, char_size_upper=30, char_size_lower=3, default_char_size=12)¶
-
depdf.page_tools.
analyze_page_num_word
(phrases, page_height, page_width, top_fraction=Decimal('0.7'), left_fraction=Decimal('0.4'), right_fraction=Decimal('0.6'))¶
-
depdf.page_tools.
analyze_page_orientation
(plumber_page)¶ Parameters: plumber_page – pdfplumber.page.Page class Returns:
-
depdf.page_tools.
calculate_paragraph_border
(depdf_page_object)¶
-
depdf.page_tools.
curve_to_lines
(curves)¶
-
depdf.page_tools.
edges_to_lines
(edges)¶
-
depdf.page_tools.
format_text
(text)¶
-
depdf.page_tools.
merge_page_figures
(pdf_page, tables_raw=None, logo=None, min_width=3, min_height=3, pid='1')¶
-
depdf.page_tools.
remove_duplicate_chars
(chars, overlap_size=3)¶
-
depdf.page_tools.
remove_single_lines
(lines, max_double=3, min_double=0.05, vertical_double=2, m='h')¶
depdf.pdf module¶
-
class
depdf.pdf.
DePDF
(pdf, config=None, **kwargs)¶ Bases:
depdf.base.Base
-
close
()¶
-
config
¶
-
extract_html_pages
()¶
-
generate_pages
()¶
-
get_prefix
()¶
-
html
¶
-
html_pages
¶
-
classmethod
load
(file_name, config=None, **kwargs)¶
-
logo
¶
-
classmethod
open
(*args, **kwargs)¶
-
page_num
¶
-
pages
¶
-
pdf
¶
-
refresh
()¶
-
same
¶
-
save_html
()¶
-
to_html
¶
-
-
depdf.pdf.
check_pdf_type
(pdf)¶
depdf.pdf_tools module¶
-
depdf.pdf_tools.
check_page_orientation
(pdf, pid)¶ Parameters: - pdf – pdfplumber class
- pid – page number, starting from 0
Returns:
-
depdf.pdf_tools.
pdf_head_tail
(pdf, config=None)¶ Parameters: - pdf – plumber pdf object
- config – depdf config class
Returns: PDF 文件的页眉和页脚 (the header and footer of the PDF file)
-
depdf.pdf_tools.
pdf_logo
(pdf)¶
depdf.settings module¶
depdf.utils module¶
-
depdf.utils.
calc_bbox
(objects)¶
-
depdf.utils.
calc_overlap
(a, b)¶ 检查两个线段的重叠部分长度 (compute the length of the overlapping part of two line segments). Parameters: - a – [a_lower, a_upper]
- b – [b_lower, b_upper]
Returns: overlapping length
-
depdf.utils.
construct_style
(style=None)¶
-
depdf.utils.
convert_html_to_soup
(html, parser='html.parser')¶
-
depdf.utils.
convert_soup_to_html
(soup)¶
-
depdf.utils.
repr_str
(text, max_length=5)¶
depdf.version module¶
Module contents¶
depdf¶
An ultimate PDF file disintegration tool. DePDF is designed to extract tables and paragraphs from PDF pages into a structured markup language (e.g. HTML). You can also use it to convert a single page or a whole PDF to HTML.
-
class
depdf.
Config
(**kwargs)¶ Bases:
object
-
add_horizontal_line_tolerance
= Decimal('0.1')¶
-
add_horizontal_lines_flag
= False¶
-
add_line_flag
= False¶
-
add_vertical_lines_flag
= False¶
-
char_overlap_size
= 3¶
-
char_size_lower
= Decimal('3')¶
-
char_size_upper
= Decimal('30')¶
-
column_region_half_width
= 4¶
-
copy
(**kwargs)¶
-
curved_line_flag
= False¶
-
debug_flag
= False¶
-
default_char_size
= Decimal('12')¶
-
default_head_tail_page_offset_percent
= 0.1¶
-
dotted_line_flag
= True¶
-
image_class
= 'pdf-image'¶
-
image_flag
= True¶
-
image_resolution
= 300¶
-
log_level
= 30¶
-
logo_flag
= True¶
-
main_frame_tolerance
= None¶
-
max_columns
= 3¶
-
max_double_line_tolerance
= 3¶
-
min_column_region_objects
= 1¶
-
min_double_line_tolerance
= Decimal('0.05')¶
-
min_image_size
= 80¶
-
mini_page_class
= 'pdf-mini-page'¶
-
multiple_columns_flag
= True¶
-
page_class
= 'pdf-page'¶
-
page_num_left_fraction
= Decimal('0.44')¶
-
page_num_right_fraction
= Decimal('0.56')¶
-
page_num_top_fraction
= Decimal('0.75')¶
-
paragraph_class
= 'pdf-paragraph'¶
-
paragraph_flag
= True¶
-
pdf_class
= 'pdf-content'¶
-
resolution
= 144¶
-
skip_empty_table
= False¶
-
snap_flag
= False¶
-
span_class
= 'pdf-span'¶
-
table_cell_merge_tolerance
= 5¶
-
table_class
= 'pdf-table'¶
-
table_flag
= True¶
-
temp_dir_prefix
= 'temp_depdf'¶
-
to_dict
¶
-
unique_prefix
= None¶
-
update
(**kwargs)¶
-
verbose_flag
= False¶
-
vertical_double_line_tolerance
= Decimal('2')¶
-
x_tolerance
= None¶
-
y_tolerance
= None¶
-
-
class
depdf.
DePDF
(pdf, config=None, **kwargs)¶ Bases:
depdf.base.Base
-
close
()¶
-
config
¶
-
extract_html_pages
()¶
-
generate_pages
()¶
-
get_prefix
()¶
-
html
¶
-
html_pages
¶
-
classmethod
load
(file_name, config=None, **kwargs)¶
-
logo
¶
-
classmethod
open
(*args, **kwargs)¶
-
page_num
¶
-
pages
¶
-
pdf
¶
-
refresh
()¶
-
same
¶
-
save_html
()¶
-
to_html
¶
-
-
class
depdf.
DePage
(page, pid='1', same=None, logo=None, config=None, columns=1, mini=False)¶ Bases:
depdf.base.Base
-
analyze_lines
()¶
-
analyze_main_frame
()¶
-
analyze_page_attributes
()¶
-
analyze_paragraph_border
()¶
-
ave_cs
= 0¶
-
ave_lh
= 3¶
-
border
= (0, 0, 0, 0)¶
-
chars
¶
-
check_if_toc_page
()¶
-
check_multi_column_page
()¶
-
config
¶
-
debug
= False¶
-
extract_images
()¶
-
extract_paragraph
()¶
-
extract_phrases
()¶
-
extract_tables
()¶
-
frame_bottom
= 0¶
-
frame_top
= 0¶
-
h_edges
= []¶
-
height
¶
-
html
¶
-
images
¶
-
images_raw
¶
-
min_cs
= 0¶
-
new_para_end_flag
= None¶
-
new_para_start_flag
= None¶
-
object_key_list
= ['_tables', '_paragraphs', '_images']¶
-
objects
¶
-
orientation
= ''¶
-
page
¶
-
pagination_phrases
= []¶
-
paragraphs
¶
-
phrases
= None¶
-
pid
¶
-
prefix
= UUID('e46d017a-0ce2-4c4a-98e0-2d0903969c58')¶
-
process_mini_page
()¶
-
process_page
()¶
-
refresh
()¶
-
save_html
()¶
-
screenshot
¶
-
set_global
()¶
-
tables
¶
-
tables_raw
¶
-
temp_dir
= 'temp'¶
-
to_html
¶
-
to_screenshot
()¶
-
toc_flag
= False¶
-
v_edges
= []¶
-
verbose
= False¶
-
width
¶
-
x_tolerance
= 3¶
-
y_tolerance
= 3¶
-
-
depdf.
convert_pdf_to_html
(pdf, **kwargs)¶ Parameters: - pdf – pdf file path
- kwargs – config keyword arguments
Returns: pdf html string
-
depdf.
convert_page_to_html
(pdf, pid, **kwargs)¶ Parameters: - pdf – pdf file path
- pid – page number, starting from 1
- kwargs – config keyword arguments
Returns: page html string
-
depdf.
extract_page_tables
(pdf, pid, **kwargs)¶ Parameters: - pdf – pdf file path
- pid – page number, starting from 1
- kwargs – config keyword arguments
Returns: page tables list
-
depdf.
extract_page_paragraphs
(pdf, pid, **kwargs)¶ Parameters: - pdf – pdf file path
- pid – page number, starting from 1
- kwargs – config keyword arguments
Returns: page paragraphs list