Java 读取PDF中表格的工具
目录
本文以Java示例展示三种读取PDF中表格的方法:Spire.PDF、Tabula 和 PDFBox。
1、方法1:Spire.PDF
1.1 Maven仓库下载导入
在pom.xml中配置maven路径,指定依赖,如下:
<repositories>
<repository>
<id>com.e-iceblue</id>
<url>https://repo.e-iceblue.cn/repository/maven-public/</url>
</repository>
</repositories>
<dependencies>
<dependency>
<groupId>e-iceblue</groupId>
<artifactId>spire.pdf</artifactId>
<version>4.10.2</version>
</dependency>
</dependencies>
1.2 读取PDF中的表格
1.2.1 代码
package com;
import com.spire.pdf.PdfDocument;
import com.spire.pdf.utilities.PdfTable;
import com.spire.pdf.utilities.PdfTableExtractor;
import java.io.FileWriter;
import java.io.IOException;
/**
 * Demo: extracts every table from a PDF with Spire.PDF and writes the cell
 * text to a tab-separated text file (one table row per line).
 */
public class ExtractTable {

    /** PDF that is read by {@link #main(String[])}. */
    private static final String INPUT_PDF =
            "E:\\文档\\金融数据抽取\\公告\\600519_贵州茅台_贵州茅台2019年年度报告.pdf";

    /** Text file that receives the extracted table content. */
    private static final String OUTPUT_TXT =
            "E:\\文档\\金融数据抽取\\公告\\600519_贵州茅台_贵州茅台2019年年度报告.txt";

    /**
     * Loads the PDF, extracts all tables and writes them to the output file.
     *
     * @param args unused
     * @throws IOException if the output file cannot be written
     */
    public static void main(String[] args) throws IOException {
        PdfDocument pdf = new PdfDocument();
        try {
            // Load the PDF document.
            pdf.loadFromFile(INPUT_PDF);
            String tables = extractTables(pdf);

            // try-with-resources closes (and therefore flushes) the writer even
            // when write() throws, which the original manual close() did not.
            // NOTE: FileWriter uses the platform default charset here.
            try (FileWriter writer = new FileWriter(OUTPUT_TXT)) {
                writer.write(tables);
            }
        } finally {
            // Release the resources held by the loaded document.
            pdf.close();
        }
    }

    /**
     * Collects the text of every table cell on every page of the document.
     *
     * @param pdf an already loaded document
     * @return cells separated by a tab, rows terminated by CRLF
     */
    private static String extractTables(PdfDocument pdf) {
        StringBuilder builder = new StringBuilder();
        PdfTableExtractor extractor = new PdfTableExtractor(pdf);
        int pageCount = pdf.getPages().getCount();
        for (int page = 0; page < pageCount; page++) {
            PdfTable[] tables = extractor.extractTable(page);
            // Skip pages for which the extractor reports no tables.
            if (tables == null || tables.length == 0) {
                continue;
            }
            for (PdfTable table : tables) {
                int rows = table.getRowCount();
                int columns = table.getColumnCount();
                for (int i = 0; i < rows; i++) {
                    for (int j = 0; j < columns; j++) {
                        builder.append(table.getText(i, j)).append('\t');
                    }
                    builder.append("\r\n");
                }
            }
        }
        return builder.toString();
    }
}
1.2.2 表格内容
1.2.3 读取结果
2、方法2:Tabula
2.1 Maven仓库下载导入
在pom.xml中配置maven路径,指定依赖,如下:
<dependency>
<groupId>technology.tabula</groupId>
<artifactId>tabula</artifactId>
<version>1.0.3</version>
<exclusions>
<exclusion>
<artifactId>slf4j-simple</artifactId>
<groupId>org.slf4j</groupId>
</exclusion>
</exclusions>
</dependency>
2.2 读取PDF中的表格
2.2.1 代码
package com;
import com.alibaba.fastjson.JSONArray;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import technology.tabula.CommandLineApp;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.List;
/**
 * Demo: drives tabula's command-line application in-process and prints the
 * extracted tables (JSON) to standard output.
 */
public class TestPdfTabula {

    /**
     * Builds a tabula command line, runs the extraction into an in-memory
     * buffer and prints the buffer.
     *
     * @param args unused; the tabula arguments are hard-coded below
     * @throws Exception if argument parsing or table extraction fails
     */
    public static void main(String[] args) throws Exception {
        // Tabula arguments (per the original author's notes):
        //   -f  output format; default is CSV, the value must be upper-case
        //   -p  page(s) to extract; "all" means every page
        //   <path> the PDF to read, e.g. D:\\1xx.pdf
        //   -l  force lattice-mode extraction (the key option here)
        String[] tabulaArgs = {
            "-f=JSON",
            "-p=4",
            "E:\\文档\\金融数据抽取\\公告\\600519_贵州茅台_贵州茅台2019年年度报告.pdf",
            "-l"
        };

        // Calling CommandLineApp.main(tabulaArgs) directly is the alternative;
        // building the app by hand lets the output be captured in a buffer.
        CommandLine parsed = new DefaultParser().parse(CommandLineApp.buildOptions(), tabulaArgs);
        StringBuilder extracted = new StringBuilder();
        CommandLineApp app = new CommandLineApp(extracted, parsed);
        app.extractTables(parsed);

        System.out.println("打印返回数据: " + extracted);
    }
}
2.2.2 表格内容
2.2.3 抽取结果
[
{
"extraction_method": "lattice",
"top": 149.34004,
"left": 58.485725,
"width": 441.1282958984375,
"height": 98.79637145996094,
"right": 499.614,
"bottom": 248.13641,
"data": [
[
{
"top": 149.34004,
"left": 58.485725,
"width": 441.1282958984375,
"height": 14.144073486328125,
"text": "常用词语释义"
},
{
"top": 0.0,
"left": 0.0,
"width": 0.0,
"height": 0.0,
"text": ""
},
{
"top": 0.0,
"left": 0.0,
"width": 0.0,
"height": 0.0,
"text": ""
}
],
[
{
"top": 163.48412,
"left": 58.485725,
"width": 106.0539779663086,
"height": 14.136001586914063,
"text": "证监会"
},
{
"top": 163.48412,
"left": 164.5397,
"width": 91.28379821777344,
"height": 14.136001586914063,
"text": "指"
},
{
"top": 163.48412,
"left": 255.8235,
"width": 243.79051208496095,
"height": 14.136001586914063,
"text": "中国证券监督管理委员会"
}
],
[
{
"top": 177.62011999999999,
"left": 58.485725,
"width": 106.0539779663086,
"height": 14.0400390625,
"text": "上交所"
},
{
"top": 177.62011999999999,
"left": 164.5397,
"width": 91.28379821777344,
"height": 14.0400390625,
"text": "指"
},
{
"top": 177.62011999999999,
"left": 255.8235,
"width": 243.79051208496095,
"height": 14.0400390625,
"text": "上海证券交易所"
}
],
[
{
"top": 191.66016,
"left": 58.485725,
"width": 106.0539779663086,
"height": 14.15972900390625,
"text": "本公司、公司"
},
{
"top": 191.66016,
"left": 164.5397,
"width": 91.28379821777344,
"height": 14.15972900390625,
"text": "指"
},
{
"top": 191.66016,
"left": 255.8235,
"width": 243.79051208496095,
"height": 14.15972900390625,
"text": "贵州茅台酒股份有限公司"
}
],
[
{
"top": 205.81989,
"left": 58.485725,
"width": 106.0539779663086,
"height": 14.160186767578125,
"text": "控股股东"
},
{
"top": 205.81989,
"left": 164.5397,
"width": 91.28379821777344,
"height": 14.160186767578125,
"text": "指"
},
{
"top": 205.81989,
"left": 255.8235,
"width": 243.79051208496095,
"height": 14.160186767578125,
"text": "中国贵州茅台酒厂(集团)有限责任公司"
}
],
[
{
"top": 219.98007,
"left": 58.485725,
"width": 106.0539779663086,
"height": 14.039901733398438,
"text": "报告期"
},
{
"top": 219.98007,
"left": 164.5397,
"width": 91.28379821777344,
"height": 14.039901733398438,
"text": "指"
},
{
"top": 219.98007,
"left": 255.8235,
"width": 243.79051208496095,
"height": 14.039901733398438,
"text": "2019 年度"
}
],
[
{
"top": 234.01997,
"left": 58.485725,
"width": 106.0539779663086,
"height": 14.116439819335938,
"text": "本报告"
},
{
"top": 234.01997,
"left": 164.5397,
"width": 91.28379821777344,
"height": 14.116439819335938,
"text": "指"
},
{
"top": 234.01997,
"left": 255.8235,
"width": 243.79051208496095,
"height": 14.116439819335938,
"text": "2019 年年度报告"
}
]
]
},
{
"extraction_method": "lattice",
"top": 325.7725,
"left": 62.325226,
"width": 444.7215576171875,
"height": 60.634979248046878,
"right": 507.04678,
"bottom": 386.40747,
"data": [
[
{
"top": 325.7725,
"left": 62.325226,
"width": 193.49827575683595,
"height": 15.197784423828125,
"text": "公司的中文名称"
},
{
"top": 325.7725,
"left": 255.8235,
"width": 251.22328186035157,
"height": 15.197784423828125,
"text": "贵州茅台酒股份有限公司"
}
],
[
{
"top": 340.97028,
"left": 62.325226,
"width": 193.49827575683595,
"height": 15.119720458984375,
"text": "公司的中文简称"
},
{
"top": 340.97028,
"left": 255.8235,
"width": 251.22328186035157,
"height": 15.119720458984375,
"text": "贵州茅台"
}
],
[
{
"top": 356.09,
"left": 62.325226,
"width": 193.49827575683595,
"height": 15.119781494140625,
"text": "公司的外文名称"
},
{
"top": 356.09,
"left": 255.8235,
"width": 251.22328186035157,
"height": 15.119781494140625,
"text": "Kweichow Moutai Co.,Ltd."
}
],
[
{
"top": 371.20978,
"left": 62.325226,
"width": 193.49827575683595,
"height": 15.19769287109375,
"text": "公司的法定代表人"
},
{
"top": 371.20978,
"left": 255.8235,
"width": 251.22328186035157,
"height": 15.19769287109375,
"text": "高卫东"
}
]
]
},
{
"extraction_method": "lattice",
"top": 433.77307,
"left": 62.325226,
"width": 444.7215576171875,
"height": 84.65386962890625,
"right": 507.04678,
"bottom": 518.42694,
"data": [
[
{
"top": 433.77307,
"left": 62.325226,
"width": 121.29447937011719,
"height": 14.13677978515625,
"text": ""
},
{
"top": 433.77307,
"left": 183.6197,
"width": 175.25013732910157,
"height": 14.13677978515625,
"text": "董事会秘书"
},
{
"top": 433.77307,
"left": 358.86984,
"width": 148.17694091796876,
"height": 14.13677978515625,
"text": "证券事务代表"
}
],
[
{
"top": 447.90985,
"left": 62.325226,
"width": 121.29447937011719,
"height": 14.04034423828125,
"text": "姓名"
},
{
"top": 447.90985,
"left": 183.6197,
"width": 175.25013732910157,
"height": 14.04034423828125,
"text": "刘刚"
},
{
"top": 447.90985,
"left": 358.86984,
"width": 148.17694091796876,
"height": 14.04034423828125,
"text": "陈华"
}
],
[
{
"top": 461.9502,
"left": 62.325226,
"width": 121.29447937011719,
"height": 14.15997314453125,
"text": "联系地址"
},
{
"top": 461.9502,
"left": 183.6197,
"width": 175.25013732910157,
"height": 14.15997314453125,
"text": "贵州省仁怀市茅台镇"
},
{
"top": 461.9502,
"left": 358.86984,
"width": 148.17694091796876,
"height": 14.15997314453125,
"text": "贵州省仁怀市茅台镇"
}
],
[
{
"top": 476.11017,
"left": 62.325226,
"width": 121.29447937011719,
"height": 14.16015625,
"text": "电话"
},
{
"top": 476.11017,
"left": 183.6197,
"width": 175.25013732910157,
"height": 14.16015625,
"text": "0851-22386002"
},
{
"top": 476.11017,
"left": 358.86984,
"width": 148.17694091796876,
"height": 14.16015625,
"text": "0851-22386002"
}
],
[
{
"top": 490.27032,
"left": 62.325226,
"width": 121.29447937011719,
"height": 14.040130615234375,
"text": "传真"
},
{
"top": 490.27032,
"left": 183.6197,
"width": 175.25013732910157,
"height": 14.040130615234375,
"text": "0851-22386193"
},
{
"top": 490.27032,
"left": 358.86984,
"width": 148.17694091796876,
"height": 14.040130615234375,
"text": "0851-22386193"
}
],
[
{
"top": 504.31046,
"left": 62.325226,
"width": 121.29447937011719,
"height": 14.116485595703125,
"text": "电子信箱"
},
{
"top": 504.31046,
"left": 183.6197,
"width": 175.25013732910157,
"height": 14.116485595703125,
"text": "mtdm@moutaichina.com"
},
{
"top": 504.31046,
"left": 358.86984,
"width": 148.17694091796876,
"height": 14.116485595703125,
"text": "mtdm@moutaichina.com"
}
]
]
},
{
"extraction_method": "lattice",
"top": 565.7921,
"left": 62.325226,
"width": 444.7215576171875,
"height": 90.90570068359375,
"right": 507.04678,
"bottom": 656.6978,
"data": [
[
{
"top": 565.7921,
"left": 62.325226,
"width": 193.49827575683595,
"height": 15.1981201171875,
"text": "公司注册地址"
},
{
"top": 565.7921,
"left": 255.8235,
"width": 251.22328186035157,
"height": 15.1981201171875,
"text": "贵州省仁怀市茅台镇"
}
],
[
{
"top": 580.99023,
"left": 62.325226,
"width": 193.49827575683595,
"height": 15.11981201171875,
"text": "公司注册地址的邮政编码"
},
{
"top": 580.99023,
"left": 255.8235,
"width": 251.22328186035157,
"height": 15.11981201171875,
"text": "564501"
}
],
[
{
"top": 596.11005,
"left": 62.325226,
"width": 193.49827575683595,
"height": 15.1214599609375,
"text": "公司办公地址"
},
{
"top": 596.11005,
"left": 255.8235,
"width": 251.22328186035157,
"height": 15.1214599609375,
"text": "贵州省仁怀市茅台镇"
}
],
[
{
"top": 611.2315,
"left": 62.325226,
"width": 193.49827575683595,
"height": 15.14837646484375,
"text": "公司办公地址的邮政编码"
},
{
"top": 611.2315,
"left": 255.8235,
"width": 251.22328186035157,
"height": 15.14837646484375,
"text": "564501"
}
],
[
{
"top": 626.3799,
"left": 62.325226,
"width": 193.49827575683595,
"height": 15.2398681640625,
"text": "公司网址"
},
{
"top": 626.3799,
"left": 255.8235,
"width": 251.22328186035157,
"height": 15.2398681640625,
"text": "http://www.moutaichina.com/"
}
],
[
{
"top": 641.61975,
"left": 62.325226,
"width": 193.49827575683595,
"height": 15.07806396484375,
"text": "电子信箱"
},
{
"top": 641.61975,
"left": 255.8235,
"width": 251.22328186035157,
"height": 15.07806396484375,
"text": "mtdm@moutaichina.com"
}
]
]
},
{
"extraction_method": "lattice",
"top": 704.1823,
"left": 62.325226,
"width": 444.7215576171875,
"height": 45.267578125,
"right": 507.04678,
"bottom": 749.4499,
"data": [
[
{
"top": 704.1823,
"left": 62.325226,
"width": 207.12469482421876,
"height": 15.07757568359375,
"text": "公司选定的信息披露媒体名称"
},
{
"top": 704.1823,
"left": 269.44992,
"width": 237.59686279296876,
"height": 15.07757568359375,
"text": "《中国证券报》《上海证券报》"
}
],
[
{
"top": 719.2599,
"left": 62.325226,
"width": 207.12469482421876,
"height": 15.11956787109375,
"text": "登载年度报告的中国证监会指定网站的网址"
},
{
"top": 719.2599,
"left": 269.44992,
"width": 237.59686279296876,
"height": 15.11956787109375,
"text": "http://www.sse.com.cn/"
}
],
[
{
"top": 734.37946,
"left": 62.325226,
"width": 207.12469482421876,
"height": 15.0704345703125,
"text": "公司年度报告备置地点"
},
{
"top": 734.37946,
"left": 269.44992,
"width": 237.59686279296876,
"height": 15.0704345703125,
"text": "公司董事会办公室"
}
]
]
}
]
需要注意的是:上面的示例使用的是 lattice 模式(-l 参数),该模式依靠表格的边框线来识别单元格;如果表格两边没有框,就解析不了!
3、终极大杀器:pdfbox
3.1 Maven仓库下载导入
在pom.xml中配置maven路径,指定依赖,如下:
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.15</version>
</dependency><!-- https://mvnrepository.com/artifact/org.apache.pdfbox/fontbox -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>fontbox</artifactId>
<version>2.0.15</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.pdfbox/jempbox -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>jempbox</artifactId>
<version>1.8.16</version>
</dependency>
3.2 读取PDF中的表格
3.2.1 代码
package com;
import org.apache.pdfbox.io.RandomAccessFile;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
/**
 * Demo helpers that extract the plain text of a PDF with Apache PDFBox.
 */
public class PDFUtil {

    /** Value returned by {@link #readPDF(String)} when the file cannot be read. */
    private static final String READ_FAILED = "read failed";

    /**
     * Reads the text of a PDF file via {@code PDDocument.load}.
     *
     * @param filePath path of the PDF file
     * @return the extracted text, or {@code "read failed"} if the file does not
     *         exist or any exception occurs while reading it
     */
    public static String readPDF(String filePath) {
        try {
            File input = new File(filePath);
            if (!input.exists()) {
                return READ_FAILED;
            }
            // try-with-resources closes the document even if getText() throws.
            try (PDDocument document = PDDocument.load(input)) {
                PDFTextStripper stripper = new PDFTextStripper();
                stripper.setSortByPosition(false);
                return stripper.getText(document);
            }
        } catch (Exception e) {
            e.printStackTrace();
            return READ_FAILED;
        }
    }

    /**
     * Reads the text of a PDF file by driving {@code PDFParser} directly.
     *
     * @param fileName path of the PDF file
     * @return the extracted text, or an empty string if reading fails (the
     *         failure is reported on standard output)
     */
    public static String readPDF2(String fileName) {
        String result = "";
        File file = new File(fileName);
        // Open read-only: the parser only reads, and "rw" would fail on
        // write-protected files. Both resources are closed on every path.
        try (RandomAccessFile source = new RandomAccessFile(file, "r")) {
            // Create a parser for the file and parse it.
            PDFParser parser = new PDFParser(source);
            parser.parse();
            // Obtain the parsed document and strip its text.
            try (PDDocument document = parser.getPDDocument()) {
                PDFTextStripper stripper = new PDFTextStripper();
                // Per the original note: true reads the text line by line
                // (sorted by position); false is the default.
                stripper.setSortByPosition(false);
                result = stripper.getText(document);
            }
        } catch (Exception e) {
            System.out.println("读取PDF文件" + file.getAbsolutePath() + "生失败!" + e);
            e.printStackTrace();
        }
        return result;
    }

    /**
     * Demo entry point: extracts the text of a sample PDF, prints it and
     * writes it to a .txt file next to the PDF.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String fileName = "600519_贵州茅台_贵州茅台2019年年度报告";
        // The absolute folder is hard-coded here for the demo.
        String folder = "E:\\文档\\金融数据抽取\\公告\\";
        String filePath = folder + fileName + ".pdf";

        // readPDF(filePath) can be used instead of readPDF2(filePath).
        String result = readPDF2(filePath);
        System.out.println(result);

        // Write the extracted content to a text file; try-with-resources
        // closes (and flushes) the writer even when write() throws.
        try (FileWriter fileWriter = new FileWriter(folder + fileName + ".txt")) {
            fileWriter.write(result);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
3.2.2 表格内容
3.2.3 抽取结果
4、总结
几种方式各有利弊,自己权衡。
更多推荐
所有评论(0)