Extract table data from pdf
Published
· 4 months ago
pythonpdfpypdf

The screenshot of the first page of the pdf is attached above
The pdf contains a table listed with products and their prices. (Get the pdf -> pricelist.pdf)
The objective is to extract the information from the PDF table and store it in JSON format below.
{
"data": [
{
"category": "CPU INTEL",
"gst": 18,
"products": [
{
"id": 1,
"name": "CELERON (G470)(1155)",
"price": 2090
},
{
"id": 2,
"name": "DUAL CORE G6405",
"price": 4790
},
{
"id": 3,
"name": "GEN 3rd I-3 (1y)",
"price": 1090
},
{
"id": 4,
"name": "GEN 3rd I-5 (1y)",
"price": 1890
},
{
"id": 5,
"name": "GEN 4th I-5 (1y)",
"price": 2390
},
{
"id": 6,
"name": "GEN 10th I-3 (10100)",
"price": 0
},
{
"id": 7,
"name": "GEN 10th I-3 (10105 F)",
"price": 5390
},
{
"id": 8,
"name": "GEN 10th I-3 (10105)",
"price": 8190
},
{
"id": 9,
"name": "GEN 10th I-5 (10400)",
"price": 9790
},
{
"id": 10,
"name": "GEN 10th I-5 (10400F)",
"price": 7690
},
...
]
}
]
}
{
"data": [
{
"category": "CPU INTEL",
"gst": 18,
"products": [
{
"id": 1,
"name": "CELERON (G470)(1155)",
"price": 2090
},
{
"id": 2,
"name": "DUAL CORE G6405",
"price": 4790
},
{
"id": 3,
"name": "GEN 3rd I-3 (1y)",
"price": 1090
},
{
"id": 4,
"name": "GEN 3rd I-5 (1y)",
"price": 1890
},
{
"id": 5,
"name": "GEN 4th I-5 (1y)",
"price": 2390
},
{
"id": 6,
"name": "GEN 10th I-3 (10100)",
"price": 0
},
{
"id": 7,
"name": "GEN 10th I-3 (10105 F)",
"price": 5390
},
{
"id": 8,
"name": "GEN 10th I-3 (10105)",
"price": 8190
},
{
"id": 9,
"name": "GEN 10th I-5 (10400)",
"price": 9790
},
{
"id": 10,
"name": "GEN 10th I-5 (10400F)",
"price": 7690
},
...
]
}
]
}
Approach: Text iteration
- Analyze the pattern of occurrence of the text on pdf page.
- Identify Y coordinates of the table starting and ending.
- Exclude the text outside the table based on the table starting and ending Y coordinates.
- With the filtered text, prepare the JSON.
Warning
Extracting tables from PDF has no thumb rule. Text extraction from a PDF can be pretty tricky. In several cases there is no clear answer what the expected result should look like.
...Read more
Code
We are in search of the text mentioned below.
CPU INTEL 18%
1 CELERON (G470)(1155) 20901
2 DUAL CORE G6405 4790
CPU INTEL 18%
1 CELERON (G470)(1155) 20901
2 DUAL CORE G6405 4790
Let’s write a little program to extract text from the first page
We will print the first 10 texts from the page to identify the arrangement of the texts on the page.
The Python package PyPDF2 provides a function
extract_text
that iterates over the text it encounters on the page.import os
from ast import literal_eval
from pypdf import PdfReader
prod = literal_eval(os.environ["PROD"])
index = 0
def visitor_body(text: str, _cm, _tm, _fontDict, _fontSize):
global index
if index < 10:
print(text)
index += 1
def main():
reader = PdfReader("pricelist.pdf")
first_page = reader.pages[0]
first_page.extract_text(visitor_text=visitor_body)
if __name__ == "__main__":
main()
import os
from ast import literal_eval
from pypdf import PdfReader
prod = literal_eval(os.environ["PROD"])
index = 0
def visitor_body(text: str, _cm, _tm, _fontDict, _fontSize):
global index
if index < 10:
print(text)
index += 1
def main():
reader = PdfReader("pricelist.pdf")
first_page = reader.pages[0]
first_page.extract_text(visitor_text=visitor_body)
if __name__ == "__main__":
main()
Output
25-03-2023
COST TO COST
1st Floor, Farm Bhawan, 14-15 Nehru Place
Output
25-03-2023
COST TO COST
1st Floor, Farm Bhawan, 14-15 Nehru Place
We can clearly spot this text on top of the pdf first page. This is not our target text.
Let’s tweak a little bit in
visitor_body
function to get to the target text CPU INTEL 18%
def visitor_body(text: str, _cm, _tm, _fontDict, _fontSize):
if "CPU INTEL" in text:
print(text)
def visitor_body(text: str, _cm, _tm, _fontDict, _fontSize):
if "CPU INTEL" in text:
print(text)
Output
CPU INTEL 18%
Output
CPU INTEL 18%
Since the table starts from "CPU INTEL 18%". This is what we need as the starting point.
Let’s explore
cm
and tm
parameters of visitor_body
function.def visitor_body(text: str, cm, tm, _fontDict, _fontSize):
if "CPU INTEL" in text:
print(text)
print("current transformation matrix", cm)
print("text matrix", tm)
def visitor_body(text: str, cm, tm, _fontDict, _fontSize):
if "CPU INTEL" in text:
print(text)
print("current transformation matrix", cm)
print("text matrix", tm)
Output
CPU INTEL 18%
current transformation matrix [0.75, 0.0, 0.0, -0.75, 0.0, 841.920044]
text matrix [1.0, 0.0, 0.0, -1.0, 62.240002, 288.160004]
Output
CPU INTEL 18%
current transformation matrix [0.75, 0.0, 0.0, -0.75, 0.0, 841.920044]
text matrix [1.0, 0.0, 0.0, -1.0, 62.240002, 288.160004]
Index 4 and 5 of the text matrix
tm
are the x and y coordinates of the text “CPU INTEL 18%” on the first pageWe do not need any text above and below the table.
So we have to find the anchor text and their Y coordinates to exclude the texts out that lies outside the table.
Table anchor texts
Position | Text | Reason |
---|---|---|
Top | CPU INTEL 18% | Table starts from this text |
Bottom | G.S.T EXTRA AS APPLICABLE | Table ends above this text |
Writing a function to identify Y coordinates.
def visitor_body(text: str, cm, tm, _fontDict, _fontSize):
if "CPU INTEL" in text or "G.S.T EXTRA AS APPLICABLE" in text:
y = tm[5]
print(text, y)
def visitor_body(text: str, cm, tm, _fontDict, _fontSize):
if "CPU INTEL" in text or "G.S.T EXTRA AS APPLICABLE" in text:
y = tm[5]
print(text, y)
Output
CPU INTEL 18% 288.160004
G.S.T EXTRA AS APPLICABLE 1037.439941
Output
CPU INTEL 18% 288.160004
G.S.T EXTRA AS APPLICABLE 1037.439941
To discard text above and below the table, ignore all entries which have coordinate
y < 280
and y < 1037.4
respectively.Y_HEADER_ENDS = 280
Y_FOOTER_STARTS = 1037.4
def visitor_body(text: str, cm, tm, _fontDict, _fontSize):
x = tm[4]
y = tm[5]
if y > Y_HEADER_ENDS and y < Y_FOOTER_STARTS:
print(text)
Y_HEADER_ENDS = 280
Y_FOOTER_STARTS = 1037.4
def visitor_body(text: str, cm, tm, _fontDict, _fontSize):
x = tm[4]
y = tm[5]
if y > Y_HEADER_ENDS and y < Y_FOOTER_STARTS:
print(text)
Output
CPU INTEL 18%
CELERON (G470)(1155)
2090
1
DUAL CORE G6405
4490
2
GEN 3rd I-3 (1y)
1490
3
GEN 3rd I-5 (1y)
2390
4
GEN 4th I-5 (1y)
2890
5
...
Output
CPU INTEL 18%
CELERON (G470)(1155)
2090
1
DUAL CORE G6405
4490
2
GEN 3rd I-3 (1y)
1490
3
GEN 3rd I-5 (1y)
2390
4
GEN 4th I-5 (1y)
2890
5
...
Now we have only the data from our target zone on the page.
Let’s clean the text and store lines in a list
just_table_text = []
def visitor_body(text: str, cm, tm, _fontDict, _fontSize):
global just_table_text
x = tm[4]
y = tm[5]
if y > Y_HEADER_ENDS and y < Y_FOOTER_STARTS:
ct = text.strip()
if ct:
just_table_text.append(ct)
def main():
reader = PdfReader("pricelist.pdf")
first_page = reader.pages[0]
first_page.extract_text(visitor_text=visitor_body)
global just_table_text
print(just_table_text)
just_table_text = []
def visitor_body(text: str, cm, tm, _fontDict, _fontSize):
global just_table_text
x = tm[4]
y = tm[5]
if y > Y_HEADER_ENDS and y < Y_FOOTER_STARTS:
ct = text.strip()
if ct:
just_table_text.append(ct)
def main():
reader = PdfReader("pricelist.pdf")
first_page = reader.pages[0]
first_page.extract_text(visitor_text=visitor_body)
global just_table_text
print(just_table_text)
Output
['CPU INTEL 18%',
'CELERON (G470)(1155)',
'2090',
'1',
'DUAL CORE G6405',
'4490',
'2',
'GEN 3rd I-3 (1y)',
'1490',
'3',
'GEN 3rd I-5 (1y)',
'2390',
'4',
'GEN 4th I-5 (1y)',
'2890',
'5',
'GEN 10th I-3 (10100)',
'ASK',
'6',
...]
Output
['CPU INTEL 18%',
'CELERON (G470)(1155)',
'2090',
'1',
'DUAL CORE G6405',
'4490',
'2',
'GEN 3rd I-3 (1y)',
'1490',
'3',
'GEN 3rd I-5 (1y)',
'2390',
'4',
'GEN 4th I-5 (1y)',
'2890',
'5',
'GEN 10th I-3 (10100)',
'ASK',
'6',
...]
Now we have all the product information inside the array. But the texts are not labeled as category, serial number, price, and item.
Creating a function
label_the_texts
that takes input just_table_text
and labels the text.def label_the_texts(raw_text_data_array):
identify_array = [dict() for _ in range(len(raw_text_data_array))]
j = 1
for i, text in enumerate(raw_text_data_array):
if "%" in text:
identify_array[i]["category"] = text
elif "ask" in text.lower():
identify_array[i]['price'] = text
elif text.isdigit():
if int(text) in range(j - 2, j + 2):
identify_array[i]["s_no"] = int(text)
j += 1
else:
identify_array[i]['price'] = int(text)
# occurs on the page where serial number goes beyond 999
elif "1E+03" in text:
identify_array[i]["s_no"] = 1000
else:
identify_array[i]["item"] = text
return identify_array
def main():
reader = PdfReader("pricelist.pdf")
first_page = reader.pages[0]
first_page.extract_text(visitor_text=visitor_body)
global just_table_text
labeled_texts = label_the_texts(just_table_text)
__import__('pprint').pprint(labeled_texts)
def label_the_texts(raw_text_data_array):
identify_array = [dict() for _ in range(len(raw_text_data_array))]
j = 1
for i, text in enumerate(raw_text_data_array):
if "%" in text:
identify_array[i]["category"] = text
elif "ask" in text.lower():
identify_array[i]['price'] = text
elif text.isdigit():
if int(text) in range(j - 2, j + 2):
identify_array[i]["s_no"] = int(text)
j += 1
else:
identify_array[i]['price'] = int(text)
# occurs on the page where serial number goes beyond 999
elif "1E+03" in text:
identify_array[i]["s_no"] = 1000
else:
identify_array[i]["item"] = text
return identify_array
def main():
reader = PdfReader("pricelist.pdf")
first_page = reader.pages[0]
first_page.extract_text(visitor_text=visitor_body)
global just_table_text
labeled_texts = label_the_texts(just_table_text)
__import__('pprint').pprint(labeled_texts)
Output
[{'category': 'CPU INTEL 18%'},
{'item': 'CELERON (G470)(1155)'},
{'price': 2090},
{'s_no': 1},
{'item': 'DUAL CORE G6405'},
{'price': 4490},
{'s_no': 2},
{'item': 'GEN 3rd I-3 (1y)'},
{'price': 1490},
{'s_no': 3},
{'item': 'GEN 3rd I-5 (1y)'},
{'price': 2390},
{'s_no': 4},
{'item': 'GEN 4th I-5 (1y)'},
{'price': 2890},
{'s_no': 5},
{'item': 'GEN 10th I-3 (10100)'},
{'price': 'ASK'},
{'s_no': 6},
{'item': 'GEN 10th I-3 (10105 F)'},
{'price': 5390},
{'s_no': 7}, ...]
Output
[{'category': 'CPU INTEL 18%'},
{'item': 'CELERON (G470)(1155)'},
{'price': 2090},
{'s_no': 1},
{'item': 'DUAL CORE G6405'},
{'price': 4490},
{'s_no': 2},
{'item': 'GEN 3rd I-3 (1y)'},
{'price': 1490},
{'s_no': 3},
{'item': 'GEN 3rd I-5 (1y)'},
{'price': 2390},
{'s_no': 4},
{'item': 'GEN 4th I-5 (1y)'},
{'price': 2890},
{'s_no': 5},
{'item': 'GEN 10th I-3 (10100)'},
{'price': 'ASK'},
{'s_no': 6},
{'item': 'GEN 10th I-3 (10105 F)'},
{'price': 5390},
{'s_no': 7}, ...]
We have labeled the texts successfully. Now it's time to convert the above marked texts to the following format.
{
"data": [
{
"category": "CPU INTEL",
"gst": 18,
"products": [
{
"id": 1,
"name": "CELERON (G470)(1155)",
"price": 2090
},
{
"id": 2,
"name": "DUAL CORE G6405",
"price": 4490
},
{
"id": 3,
"name": "GEN 3rd I-3 (1y)",
"price": 1490
}
]
}
]
}
{
"data": [
{
"category": "CPU INTEL",
"gst": 18,
"products": [
{
"id": 1,
"name": "CELERON (G470)(1155)",
"price": 2090
},
{
"id": 2,
"name": "DUAL CORE G6405",
"price": 4490
},
{
"id": 3,
"name": "GEN 3rd I-3 (1y)",
"price": 1490
}
]
}
]
}
To convert the labeled text array to the JSON, first we need to have a dictionary. I have written a function named
convert_data_to_dictionary
.Using function
convert_data_to_dictionary
in the main function to get the desired result.def new_category(value):
return {'category': value, 'products': []}
def convert_data_to_dictionary(data: List[dict]):
prev_k, prev_v = data[0].popitem()
result = [new_category(prev_v)]
for entry in data[1:]:
k, v = entry.popitem()
if k == 'category':
result.append(new_category(v))
elif (prev_k == 'category' or prev_k == 's_no') and k == "item":
result[-1]['products'].append({"item": v})
elif k == 'item':
result[-1]['products'][-1]["item"] += v
else:
result[-1]['products'][-1][k] = v
prev_k = k
return result
def main():
reader = PdfReader("pricelist.pdf")
first_page = reader.pages[0]
first_page.extract_text(visitor_text=visitor_body)
global just_table_text
labeled_texts = label_the_texts(just_table_text)
curated_dictionary = convert_data_to_dictionary(labeled_texts)
__import__('pprint').pprint(curated_dictionary)
def new_category(value):
return {'category': value, 'products': []}
def convert_data_to_dictionary(data: List[dict]):
prev_k, prev_v = data[0].popitem()
result = [new_category(prev_v)]
for entry in data[1:]:
k, v = entry.popitem()
if k == 'category':
result.append(new_category(v))
elif (prev_k == 'category' or prev_k == 's_no') and k == "item":
result[-1]['products'].append({"item": v})
elif k == 'item':
result[-1]['products'][-1]["item"] += v
else:
result[-1]['products'][-1][k] = v
prev_k = k
return result
def main():
reader = PdfReader("pricelist.pdf")
first_page = reader.pages[0]
first_page.extract_text(visitor_text=visitor_body)
global just_table_text
labeled_texts = label_the_texts(just_table_text)
curated_dictionary = convert_data_to_dictionary(labeled_texts)
__import__('pprint').pprint(curated_dictionary)
Output
{'data': [{'category': 'CPU INTEL',
'gst': 18,
'products': [{'item': 'CELERON (G470)(1155)',
'price': 2090,
's_no': 1},
{'item': 'DUAL CORE G6405', 'price': 4490, 's_no': 2},
{'item': 'GEN 3rd I-3 (1y)',
'price': 1490,
's_no': 3},
{'item': 'GEN 3rd I-5 (1y)',
'price': 2390,
's_no': 4},
{'item': 'GEN 4th I-5 (1y)',
'price': 2890,
's_no': 5},
{'item': 'GEN 10th I-3 (10100)',
'price': 'ASK',
's_no': 6},
{'item': 'GEN 10th I-3 (10105 F)',
'price': 5390,
's_no': 7},
{'item': 'GEN 10th I-3 (10105)',
'price': 8390,
's_no': 8},
{'item': 'GEN 10th I-5 (10400)',
'price': 9690,
's_no': 9}]
}]
}
Output
{'data': [{'category': 'CPU INTEL',
'gst': 18,
'products': [{'item': 'CELERON (G470)(1155)',
'price': 2090,
's_no': 1},
{'item': 'DUAL CORE G6405', 'price': 4490, 's_no': 2},
{'item': 'GEN 3rd I-3 (1y)',
'price': 1490,
's_no': 3},
{'item': 'GEN 3rd I-5 (1y)',
'price': 2390,
's_no': 4},
{'item': 'GEN 4th I-5 (1y)',
'price': 2890,
's_no': 5},
{'item': 'GEN 10th I-3 (10100)',
'price': 'ASK',
's_no': 6},
{'item': 'GEN 10th I-3 (10105 F)',
'price': 5390,
's_no': 7},
{'item': 'GEN 10th I-3 (10105)',
'price': 8390,
's_no': 8},
{'item': 'GEN 10th I-5 (10400)',
'price': 9690,
's_no': 9}]
}]
}
to run for all pages change the
main()
function as follow.def main():
reader = PdfReader("pricelist.pdf")
for page in reader.pages:
page.extract_text(visitor_text=visitor_body)
global just_table_text
labeled_texts = label_the_texts(just_table_text)
curated_dictionary = convert_data_to_dictionary(labeled_texts)
__import__("pprint").pprint({"data": curated_dictionary})
def main():
reader = PdfReader("pricelist.pdf")
for page in reader.pages:
page.extract_text(visitor_text=visitor_body)
global just_table_text
labeled_texts = label_the_texts(just_table_text)
curated_dictionary = convert_data_to_dictionary(labeled_texts)
__import__("pprint").pprint({"data": curated_dictionary})
To dump json use
json.dumps()
import json
def main():
reader = PdfReader("pricelist.pdf")
for page in reader.pages:
page.extract_text(visitor_text=visitor_body)
global just_table_text
labeled_texts = label_the_texts(just_table_text)
curated_dictionary = convert_data_to_dictionary(labeled_texts)
json.dumps({"data": curated_dictionary})
import json
def main():
reader = PdfReader("pricelist.pdf")
for page in reader.pages:
page.extract_text(visitor_text=visitor_body)
global just_table_text
labeled_texts = label_the_texts(just_table_text)
curated_dictionary = convert_data_to_dictionary(labeled_texts)
json.dumps({"data": curated_dictionary})
We get the json as expected.