pdfbox PDF解析票据[不同类型]
在做需求时发现:各个地区的完税凭证格式都不一样,给解析带来了麻烦。
话不多说,直接上代码。(注意:还有几种格式(季度)尚未解析成功,原因是通过解析文本获取不到对应的值。)

<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.24</version>
</dependency>
<dependency>
    <groupId>net.sf.cssbox</groupId>
    <artifactId>pdf2dom</artifactId>
    <version>2.0.1</version>
</dependency>
核心逻辑

import com.google.common.collect.Lists; import org.apache.commons.lang3.StringUtils; import org.apache.pdfbox.pdmodel.PDDocument; import org.fit.pdfdom.PDFDomTree; import org.fit.pdfdom.TextMetrics; import org.w3c.dom.bootstrap.DOMImplementationRegistry; import org.w3c.dom.ls.DOMImplementationLS; import org.w3c.dom.ls.LSOutput; import org.w3c.dom.ls.LSSerializer; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.math.BigDecimal; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; public class MyPdf extends PDFDomTree { private static List<String> keywordHeadList = Lists.newArrayList(); private static List<String> keywordBodyList = Lists.newArrayList(); private static List<String> keywordAllList = Lists.newArrayList(); private static List<String> textNotList = Lists.newArrayList(); private List<String> textAllList; public List<String> getTextAllList() { return textAllList; } public void setTextAllList(List<String> textAllList) { this.textAllList = textAllList; } public static List<String> getKeywordHeadList() { return keywordHeadList; } public static void setKeywordHeadList(List<String> keywordHeadList) { MyPdf.keywordHeadList = keywordHeadList; } public static List<String> getKeywordBodyList() { return keywordBodyList; } public static void setKeywordBodyList(List<String> keywordBodyList) { MyPdf.keywordBodyList = keywordBodyList; } static { keywordHeadList.add("税务机关"); keywordHeadList.add("填发日期"); keywordHeadList.add("纳税人识别号"); keywordHeadList.add("纳税人名称"); keywordBodyList.add("原凭证号"); keywordBodyList.add("税种"); keywordBodyList.add("品目名称"); keywordBodyList.add("税款所属时期"); keywordBodyList.add("税款所属期起"); keywordBodyList.add("税款所属期止"); keywordBodyList.add("入(退)库日期"); keywordBodyList.add("入(退)库日期"); keywordBodyList.add("实缴(退)金额"); keywordBodyList.add("实缴(退)税额"); keywordBodyList.add("实缴(退)税额"); keywordBodyList.add("收款国库"); keywordAllList.addAll(keywordHeadList); 
keywordAllList.addAll(keywordBodyList); textNotList.add("妥"); textNotList.add("善"); textNotList.add("保"); textNotList.add("管"); textNotList.add("手"); textNotList.add("写"); textNotList.add("无"); textNotList.add("效"); } public MyPdf() throws IOException { super(); } protected void startNewPage() { super.startNewPage(); } @Override protected void renderText(String data, TextMetrics metrics) { data = data.replace(":", "").replace(":", "").replace(",", "").replace("¥", ""); for (String s : textNotList) { data = data.replace(s, ""); } if (StringUtils.isNotBlank(data)) { textAllList.add(data); curpage.appendChild(createTextElement(data, metrics.getWidth())); } } public List<String> parsePdf(PDDocument doc) throws Exception { textAllList = Lists.newArrayList(); DOMImplementationRegistry registry = DOMImplementationRegistry.newInstance(); DOMImplementationLS impl = (DOMImplementationLS) registry.getDOMImplementation("LS"); LSSerializer writer = impl.createLSSerializer(); LSOutput output = impl.createLSOutput(); writer.getDomConfig().setParameter("format-pretty-print", true); createDOM(doc); return textAllList; } public List<String> parsePdf(File file) throws Exception { PDDocument document = PDDocument.load(file); List<String> strings = parsePdf(document); document.close(); return strings; } public List<String> parsePdf(FileInputStream fileInputStream) throws Exception { PDDocument document = PDDocument.load(fileInputStream); List<String> strings = parsePdf(document); document.close(); return strings; } public static TaxInvoicePdfDto parsePdfText(List<String> textList) { try { TaxInvoicePdfDto taxInvoicePdfDto = new TaxInvoicePdfDto(); List<TaxInvoicePdfDetail> detailList = Lists.newArrayList(); for (String keyword : keywordHeadList) { LoopTextVo loopTextVo = new LoopTextVo(textList, keyword); LoopTextVo s = loopTextHeadList(loopTextVo); if (keyword.equals("税务机关")) { taxInvoicePdfDto.setNatTaxAuth(s.getValue()); } if (keyword.equals("纳税人名称")) { 
taxInvoicePdfDto.setTaxpayerName(s.getValue()); } if (keyword.equals("纳税人识别号")) { taxInvoicePdfDto.setTaxpayerNumber(s.getValue()); } } List<String> bodyHeaderList = Lists.newArrayList(); int loopCount = 0; for (String keyword : keywordBodyList) { LoopTextVo loopTextVo = new LoopTextVo(textList, keyword); loopTextVo = loopTextBodyList(loopTextVo); bodyHeaderList.addAll(loopTextVo.getBodyHeaderList()); loopCount = Math.max(loopTextVo.getLoopCount(), loopCount); } String s = bodyHeaderList.get(bodyHeaderList.size() - 1); LoopTextVo loopTextVo = new LoopTextVo(textList, s); loopTextVo = loopTextBodyList(loopTextVo); List<String> bodyDataList = textList.subList(loopTextVo.getLoopCount() + 1, loopCount); String join = StringUtils.join(bodyDataList, " "); String datePattern = " 至 "; String datePattern1 = "\\d{4}-\\d{2}-\\d{2}至\\d{4}-\\d{2}- \\d{2}"; Pattern pattern = Pattern.compile(datePattern); Pattern pattern1 = Pattern.compile(datePattern1); Matcher matcher = pattern.matcher(join); Matcher matcher1 = pattern1.matcher(join); int index = 0; if (matcher.find()) { index += 2; } String[] s2 = join.split(" "); ArrayList<String> strings = Lists.newArrayList(s2); if (matcher1.find()) { join = join.replaceAll("至(\\d{4}-\\d{2}-) (\\d{2})", "至$1$2"); s2 = join.split(" "); strings = Lists.newArrayList(s2); strings.remove(s2.length - 1); } List<List<String>> partition = Lists.partition(strings, bodyHeaderList.size() + index); for (List<String> stringList : partition) { if(stringList.size() < bodyHeaderList.size()){ continue; } TaxInvoicePdfDetail taxInvoicePdfDetail = new TaxInvoicePdfDetail(); int index1 = 0; boolean flag = false; for (int i = 0; i < bodyHeaderList.size(); i++) { if(stringList.size() -2 == bodyHeaderList.size()) { if (bodyHeaderList.get(i).equals("税种")) { taxInvoicePdfDetail.setTaxCategory(stringList.get(i)); } if (bodyHeaderList.get(i).equals("品目名称")) { taxInvoicePdfDetail.setTaxItems(stringList.get(i)); } if 
(Lists.newArrayList("税款所属时期").contains(bodyHeaderList.get(i))) { flag = true; String s1 = stringList.get(i) + stringList.get(i + 1) + stringList.get(index1 + 2); taxInvoicePdfDetail.setLegalizeBelongDateFrom(s1.split("至")[0]); taxInvoicePdfDetail.setLegalizeBelongDateTo(s1.split("至")[1]); } if (Lists.newArrayList("税款所属时期", "税款所属期起").contains(bodyHeaderList.get(i))) { taxInvoicePdfDetail.setLegalizeBelongDateFrom(stringList.get(i)); } if (Lists.newArrayList("税款所属时期", "税款所属期止").contains(bodyHeaderList.get(i))) { taxInvoicePdfDetail.setLegalizeBelongDateTo(stringList.get(i)); } if (Lists.newArrayList("入(退)库日期", "入(退)库日期").contains(bodyHeaderList.get(i))) { index1 = flag ? i + 2 : i; taxInvoicePdfDetail.setPaymentDate(stringList.get(index1)); } if (Lists.newArrayList("实缴(退)金额", "实缴(退)税额", "实缴(退)税额").contains(bodyHeaderList.get(i))) { index1 = flag ? i + 2 : i; taxInvoicePdfDetail.setTotalAmount(new BigDecimal(stringList.get(index1))); } }else{ if (bodyHeaderList.get(i).equals("税种")) { taxInvoicePdfDetail.setTaxCategory(stringList.get(i)); } if (bodyHeaderList.get(i).equals("品目名称")) { taxInvoicePdfDetail.setTaxItems(stringList.get(i)); } if (Lists.newArrayList("税款所属时期").contains(bodyHeaderList.get(i))) { taxInvoicePdfDetail.setLegalizeBelongDateFrom(stringList.get(i).split("至")[0]); taxInvoicePdfDetail.setLegalizeBelongDateTo(stringList.get(i).split("至")[1]); } if (Lists.newArrayList("税款所属期起").contains(bodyHeaderList.get(i))) { taxInvoicePdfDetail.setLegalizeBelongDateFrom(stringList.get(i)); } if (Lists.newArrayList("税款所属期止").contains(bodyHeaderList.get(i))) { taxInvoicePdfDetail.setLegalizeBelongDateTo(stringList.get(i)); } if (Lists.newArrayList("入(退)库日期", "入(退)库日期").contains(bodyHeaderList.get(i))) { taxInvoicePdfDetail.setPaymentDate(stringList.get(i)); } if (Lists.newArrayList("实缴(退)金额", "实缴(退)税额", "实缴(退)税额").contains(bodyHeaderList.get(i))) { taxInvoicePdfDetail.setTotalAmount(new BigDecimal(stringList.get(i))); } } } detailList.add(taxInvoicePdfDetail); } 
taxInvoicePdfDto.setDetailList(detailList); return taxInvoicePdfDto; } catch (Exception e) { e.printStackTrace(); return null; } } public static LoopTextVo loopTextHeadList(LoopTextVo loopTextVo) { int loopCount = loopTextVo.getLoopCount(); List<String> textList = loopTextVo.getTextList(); String value = loopTextVo.getValue(); String header = loopTextVo.getHeader(); String keyword = loopTextVo.getKeyword(); boolean flag = loopTextVo.isFlag(); if (loopCount == textList.size()) { return loopTextVo; } String s = textList.get(loopCount); if (s.equals("备注")) { return loopTextVo; } if (flag) { for (String keywordHead : keywordAllList) { if (keywordHead.startsWith(s)) { return loopTextVo; } } loopTextVo.setValue(value + s); } if (keyword.startsWith(s) || (StringUtils.isNotBlank(header) && keyword.startsWith(header))) { header = header + s; if (StringUtils.isNotBlank(value)) { loopTextVo.setValue(value); return loopTextVo; } boolean equals = keyword.equals(header); loopTextVo.setFlag(equals); loopTextVo.setLoopCount(loopCount + 1); loopTextVo.setHeader(equals ? 
"" : header); return loopTextHeadList(loopTextVo); } else if (!flag) { loopTextVo.setHeader(""); } if (s.contains(keyword)) { loopTextVo.setValue(s.replace(keyword, "")); return loopTextVo; } loopTextVo.setLoopCount(loopCount + 1); return loopTextHeadList(loopTextVo); } public static LoopTextVo loopTextBodyList(LoopTextVo loopTextVo) { int loopCount = loopTextVo.getLoopCount(); List<String> textList = loopTextVo.getTextList(); String header = loopTextVo.getHeader(); String keyword = loopTextVo.getKeyword(); List<String> bodyHeaderList = loopTextVo.getBodyHeaderList(); if (loopCount == textList.size()) { return loopTextVo; } String s = textList.get(loopCount); if (s.startsWith("以上情况") || s.startsWith("金额合计")) { if(s.startsWith("金额合计") && textList.size()/loopCount > 2){ loopTextVo.setLoopCount(loopCount + 1); return loopTextBodyList(loopTextVo); } textList.subList(0, loopCount); loopTextVo.setTextList(textList); return loopTextVo; } if (keyword.startsWith(s) || (StringUtils.isNotBlank(header) && keyword.startsWith(header))) { header = header + s; if (keyword.equals(header)) { bodyHeaderList.add(header); return loopTextVo; } loopTextVo.setLoopCount(loopCount + 1); loopTextVo.setHeader(header); return loopTextBodyList(loopTextVo); } else { loopTextVo.setHeader(""); } if (s.contains(keyword)) { loopTextVo.setValue(s.replace(keyword, "")); return loopTextVo; } loopTextVo.setLoopCount(loopCount + 1); return loopTextBodyList(loopTextVo); } }
VO:

/**
 * Mutable scan state passed through the recursive keyword searches.
 *
 * <p>Accessors are generated by Lombok's {@code @Data}.
 */
@Data
public class LoopTextVo {

    LoopTextVo(List<String> textList, String keyword) {
        this.textList = textList;
        this.keyword = keyword;
    }

    /** Text fragments being scanned. */
    private List<String> textList;

    /** Column headers that were matched during a body scan. */
    private List<String> bodyHeaderList = new ArrayList<>();

    /**
     * Index of the fragment the scan is currently at. Renamed from {@code LoopCount} to follow
     * lowerCamelCase; the generated accessors are still getLoopCount/setLoopCount.
     */
    private int loopCount;

    /** Keyword being searched for. */
    private String keyword = "";

    /** Part of the keyword matched so far when it is spread over several fragments. */
    private String header = "";

    /** Value collected for the keyword. */
    private String value = "";

    /** Set once the keyword has been matched completely. */
    private boolean flag = false;
}
封装对象:

@Data public class TaxInvoicePdfDto { private String natTaxAuth; private String taxpayerNumber; private String taxpayerName; private List<TaxInvoicePdfDetail> detailList; } ====两个类 分开写 这里就直接扔进去了 @Data public class TaxInvoicePdfDetail { /** * 税种 */ private String taxCategory; /** * 品 目 名 称 */ private String taxItems; /** * 税款所属起 */ private String legalizeBelongDateFrom; /** * 税款所属止 */ private String legalizeBelongDateTo; /** * 入(退)库日期 */ private String paymentDate; /** * 实缴(退)金额 */ private BigDecimal totalAmount; }
测试类:
/**
 * Parses every non-directory entry of a folder and prints the file name followed by the
 * parsed result.
 *
 * @param args optional; {@code args[0]} overrides the default folder {@code D:\pdf\}
 */
public static void main(String[] args) {
    String directory = args.length > 0 ? args[0] : "D:\\pdf\\";
    try {
        File[] files = new File(directory).listFiles(pathname -> !pathname.isDirectory());
        // listFiles returns null when the path is not a directory or cannot be read.
        if (files == null) {
            System.err.println("Cannot list directory: " + directory);
            return;
        }
        for (File file : files) {
            System.out.println(file.getName());
            MyPdf pdfDomTree = new MyPdf();
            List<String> textList = pdfDomTree.parsePdf(file);
            // parsePdfText is static, so it is called on the class rather than the instance.
            TaxInvoicePdfDto taxInvoicePdfDto = MyPdf.parsePdfText(textList);
            System.out.println(taxInvoicePdfDto);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}