pdfbox PDF解析票据[不同类型]

meta-one / 2024-10-22 / 原文

在做需求的过程中发现:各个地区的完税凭证格式都不一样,这给解析带来了麻烦。

话不多说,直接上代码。(注意:还有几种格式(例如按季度的凭证)没有解析成功,通过解析文本获取不到值。)

 

<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.24</version>
</dependency>
<dependency>
    <groupId>net.sf.cssbox</groupId>
    <artifactId>pdf2dom</artifactId>
    <version>2.0.1</version>
</dependency>
View Code

核心逻辑

import com.google.common.collect.Lists;
import org.apache.commons.lang3.StringUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.fit.pdfdom.PDFDomTree;
import org.fit.pdfdom.TextMetrics;
import org.w3c.dom.bootstrap.DOMImplementationRegistry;
import org.w3c.dom.ls.DOMImplementationLS;
import org.w3c.dom.ls.LSOutput;
import org.w3c.dom.ls.LSSerializer;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Extracts the text fragments of a tax-payment certificate PDF and maps them onto a
 * {@link TaxInvoicePdfDto}.
 *
 * <p>The class extends {@link PDFDomTree} and overrides {@link #renderText} so that every text
 * fragment rendered while the DOM is built is also recorded, in rendering order, in
 * {@link #getTextAllList()}. {@link #parsePdfText(List)} then interprets that flat list using the
 * label keywords configured in the static keyword lists.
 *
 * <p>NOTE(review): the keyword lists are static and mutable, and {@code keywordAllList} is only
 * built once in the static initializer; replacing a list through a setter does not refresh it.
 */
public class MyPdf extends PDFDomTree {

    /** Matches a period whose end date was broken by a space before the day ("yyyy-MM- dd"). */
    private static final Pattern BROKEN_END_DATE =
            Pattern.compile("\\d{4}-\\d{2}-\\d{2}至\\d{4}-\\d{2}- \\d{2}");

    /** Labels looked up as single key/value pairs in the certificate header. */
    private static List<String> keywordHeadList = Lists.newArrayList();
    /** Column labels looked up in the table part of the certificate. */
    private static List<String> keywordBodyList = Lists.newArrayList();

    /** Head and body labels combined; used to detect where a header value ends. */
    private static List<String> keywordAllList = Lists.newArrayList();
    /** Strings that are removed from every rendered fragment before it is recorded. */
    private static List<String> textNotList = Lists.newArrayList();

    /** Fragments recorded by {@link #renderText}; initialised so rendering never hits null. */
    private List<String> textAllList = Lists.newArrayList();

    public List<String> getTextAllList() {
        return textAllList;
    }

    public void setTextAllList(List<String> textAllList) {
        this.textAllList = textAllList;
    }

    public static List<String> getKeywordHeadList() {
        return keywordHeadList;
    }

    public static void setKeywordHeadList(List<String> keywordHeadList) {
        MyPdf.keywordHeadList = keywordHeadList;
    }

    public static List<String> getKeywordBodyList() {
        return keywordBodyList;
    }

    public static void setKeywordBodyList(List<String> keywordBodyList) {
        MyPdf.keywordBodyList = keywordBodyList;
    }

    static {

        keywordHeadList.add("税务机关");
        keywordHeadList.add("填发日期");
        keywordHeadList.add("纳税人识别号");
        keywordHeadList.add("纳税人名称");
        keywordBodyList.add("原凭证号");
        keywordBodyList.add("税种");
        keywordBodyList.add("品目名称");
        keywordBodyList.add("税款所属时期");
        keywordBodyList.add("税款所属期起");
        keywordBodyList.add("税款所属期止");
        // Each label must appear only once: a label that is listed twice is matched twice and
        // ends up twice in the detected header row, which breaks the row partitioning.
        // The second spelling uses full-width parentheses (U+FF08 / U+FF09).
        // NOTE(review): the listing previously repeated the ASCII spelling verbatim; the
        // full-width variant is assumed to be what was intended - confirm against real PDFs.
        keywordBodyList.add("入(退)库日期");
        keywordBodyList.add("入\uFF08退\uFF09库日期");
        keywordBodyList.add("实缴(退)金额");
        keywordBodyList.add("实缴(退)税额");
        keywordBodyList.add("实缴\uFF08退\uFF09税额");
        keywordBodyList.add("收款国库");
        keywordAllList.addAll(keywordHeadList);
        keywordAllList.addAll(keywordBodyList);
        // Single characters stripped from every fragment. Together they read
        // "妥善保管" / "手写无效"; presumably a notice printed on the form - TODO confirm.
        // NOTE(review): the same characters are also removed when they occur inside real values.
        textNotList.add("妥");
        textNotList.add("善");
        textNotList.add("保");
        textNotList.add("管");
        textNotList.add("手");
        textNotList.add("写");
        textNotList.add("无");
        textNotList.add("效");
    }

    public MyPdf() throws IOException {
        super();
    }

    @Override
    protected void startNewPage() {
        super.startNewPage();
    }


    /**
     * Cleans a rendered text fragment and, if anything is left, records it and appends it to the
     * current page element.
     *
     * <p>Colons, commas and yen signs are removed (ASCII/Latin-1 forms as well as the full-width
     * colon U+FF1A and yen sign U+FFE5), followed by every entry of {@code textNotList}.
     *
     * @param data    the raw fragment text
     * @param metrics metrics of the fragment; only its width is used here
     */
    @Override
    protected void renderText(String data, TextMetrics metrics) {
        String cleaned = data
                .replace(":", "")
                .replace("\uFF1A", "")
                .replace(",", "")
                .replace("¥", "")
                .replace("\uFFE5", "");
        for (String noise : textNotList) {
            cleaned = cleaned.replace(noise, "");
        }
        if (StringUtils.isNotBlank(cleaned)) {
            textAllList.add(cleaned);
            curpage.appendChild(createTextElement(cleaned, metrics.getWidth()));
        }
    }

    /**
     * Builds the DOM for an already opened document and returns the fragments recorded while
     * doing so. The caller keeps ownership of {@code doc} and must close it.
     *
     * @param doc the opened PDF document
     * @return the recorded text fragments in rendering order
     * @throws Exception if building the DOM fails
     */
    public List<String> parsePdf(PDDocument doc) throws Exception {
        textAllList = Lists.newArrayList();
        createDOM(doc);
        return textAllList;
    }

    /**
     * Loads the given file, extracts its text fragments and closes the document again, also when
     * extraction fails.
     *
     * @param file the PDF file
     * @return the recorded text fragments in rendering order
     * @throws Exception if loading or parsing fails
     */
    public List<String> parsePdf(File file) throws Exception {
        try (PDDocument document = PDDocument.load(file)) {
            return parsePdf(document);
        }
    }

    /**
     * Loads a PDF from the given stream, extracts its text fragments and closes the document
     * again, also when extraction fails. This method does not close the stream explicitly.
     *
     * @param fileInputStream stream positioned at the start of the PDF
     * @return the recorded text fragments in rendering order
     * @throws Exception if loading or parsing fails
     */
    public List<String> parsePdf(FileInputStream fileInputStream) throws Exception {
        try (PDDocument document = PDDocument.load(fileInputStream)) {
            return parsePdf(document);
        }
    }

    /**
     * Interprets the flat fragment list produced by {@code parsePdf}.
     *
     * <p>Header labels are resolved with {@link #loopTextHeadList}; the table header row is
     * detected with {@link #loopTextBodyList}; the fragments between the last detected column
     * label and the largest index reached by any body scan are joined, re-split on spaces and
     * cut into rows of one token per detected column (plus two when a period is written as
     * three tokens "from 至 to").
     *
     * @param textList fragments in rendering order
     * @return the populated DTO, or {@code null} if any exception occurs while interpreting the
     *         fragments (the stack trace is printed)
     */
    public static TaxInvoicePdfDto parsePdfText(List<String> textList) {
        try {
            TaxInvoicePdfDto taxInvoicePdfDto = new TaxInvoicePdfDto();
            List<TaxInvoicePdfDetail> detailList = Lists.newArrayList();
            for (String keyword : keywordHeadList) {
                LoopTextVo found = loopTextHeadList(new LoopTextVo(textList, keyword));
                if (keyword.equals("税务机关")) {
                    taxInvoicePdfDto.setNatTaxAuth(found.getValue());
                }
                if (keyword.equals("纳税人名称")) {
                    taxInvoicePdfDto.setTaxpayerName(found.getValue());
                }
                if (keyword.equals("纳税人识别号")) {
                    taxInvoicePdfDto.setTaxpayerNumber(found.getValue());
                }
            }

            // Collect the column labels that are present and remember the largest index any
            // scan reached; that index is used as the exclusive end of the table data.
            List<String> bodyHeaderList = Lists.newArrayList();
            int bodyEnd = 0;
            for (String keyword : keywordBodyList) {
                LoopTextVo scan = loopTextBodyList(new LoopTextVo(textList, keyword));
                bodyHeaderList.addAll(scan.getBodyHeaderList());
                bodyEnd = Math.max(scan.getLoopCount(), bodyEnd);
            }
            String lastHeader = bodyHeaderList.get(bodyHeaderList.size() - 1);
            LoopTextVo lastHeaderScan = loopTextBodyList(new LoopTextVo(textList, lastHeader));
            List<String> bodyDataList = textList.subList(lastHeaderScan.getLoopCount() + 1, bodyEnd);

            String join = StringUtils.join(bodyDataList, " ");
            // A period written as "from 至 to" occupies three tokens instead of one.
            int extraColumns = join.contains(" 至 ") ? 2 : 0;
            List<String> tokens = Lists.newArrayList(join.split(" "));
            if (BROKEN_END_DATE.matcher(join).find()) {
                // Glue the day back onto an end date that was split as "yyyy-MM- dd".
                join = join.replaceAll("至(\\d{4}-\\d{2}-) (\\d{2})", "至$1$2");
                String[] repaired = join.split(" ");
                tokens = Lists.newArrayList(repaired);
                // The last token is dropped for this layout, as in the original logic.
                // NOTE(review): the reason is not visible in this file - confirm.
                tokens.remove(repaired.length - 1);
            }

            for (List<String> row : Lists.partition(tokens, bodyHeaderList.size() + extraColumns)) {
                if (row.size() < bodyHeaderList.size()) {
                    continue;
                }
                detailList.add(toDetail(bodyHeaderList, row));
            }
            taxInvoicePdfDto.setDetailList(detailList);
            return taxInvoicePdfDto;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Maps one table row onto a detail object according to the detected column labels.
     *
     * <p>When the row has exactly two tokens more than there are columns, the
     * "税款所属时期" column is read from three consecutive tokens and every later column is
     * read two positions further right.
     */
    private static TaxInvoicePdfDetail toDetail(List<String> headers, List<String> row) {
        TaxInvoicePdfDetail detail = new TaxInvoicePdfDetail();
        boolean periodInThreeTokens = row.size() - 2 == headers.size();
        int shift = 0;
        for (int i = 0; i < headers.size(); i++) {
            String header = headers.get(i);
            int col = i + shift;
            if (header.equals("税种")) {
                detail.setTaxCategory(row.get(col));
            } else if (header.equals("品目名称")) {
                detail.setTaxItems(row.get(col));
            } else if (header.equals("税款所属时期")) {
                String period;
                if (periodInThreeTokens) {
                    // The end date is the third token of the period, i.e. col + 2.
                    period = row.get(col) + row.get(col + 1) + row.get(col + 2);
                    shift = 2;
                } else {
                    period = row.get(col);
                }
                String[] range = period.split("至");
                detail.setLegalizeBelongDateFrom(range[0]);
                detail.setLegalizeBelongDateTo(range[1]);
            } else if (header.equals("税款所属期起")) {
                detail.setLegalizeBelongDateFrom(row.get(col));
            } else if (header.equals("税款所属期止")) {
                detail.setLegalizeBelongDateTo(row.get(col));
            } else if (header.equals("入(退)库日期") || header.equals("入\uFF08退\uFF09库日期")) {
                detail.setPaymentDate(row.get(col));
            } else if (header.equals("实缴(退)金额")
                    || header.equals("实缴(退)税额")
                    || header.equals("实缴\uFF08退\uFF09税额")) {
                detail.setTotalAmount(new BigDecimal(row.get(col)));
            }
        }
        return detail;
    }

    /**
     * Recursively scans the fragments, starting at the state's current index, for the state's
     * keyword and collects the text that follows it as the value.
     *
     * <p>The label may be spread over several fragments; it is accumulated in the state's header
     * until it equals the keyword, after which the flag is set and following fragments are
     * appended to the value until a fragment is a prefix of any known label. A fragment equal to
     * "备注", or the end of the list, stops the scan. A fragment that contains the keyword
     * yields the remainder of that fragment as the value.
     *
     * <p>The recursion advances one fragment per call, so its depth grows with the number of
     * fragments scanned.
     *
     * @param loopTextVo scan state; it is mutated and returned
     * @return the same state object
     */
    public static LoopTextVo loopTextHeadList(LoopTextVo loopTextVo) {
        int loopCount = loopTextVo.getLoopCount();
        List<String> textList = loopTextVo.getTextList();
        String value = loopTextVo.getValue();
        String header = loopTextVo.getHeader();
        String keyword = loopTextVo.getKeyword();
        boolean flag = loopTextVo.isFlag();
        if (loopCount == textList.size()) {
            return loopTextVo;
        }
        String s = textList.get(loopCount);
        if (s.equals("备注")) {
            return loopTextVo;
        }
        if (flag) {
            // The label was fully matched: stop at the start of the next label, otherwise
            // this fragment belongs to the value.
            for (String keywordHead : keywordAllList) {
                if (keywordHead.startsWith(s)) {
                    return loopTextVo;
                }
            }
            loopTextVo.setValue(value + s);
        }
        if (keyword.startsWith(s) || (StringUtils.isNotBlank(header) && keyword.startsWith(header))) {
            header = header + s;
            if (StringUtils.isNotBlank(value)) {
                loopTextVo.setValue(value);
                return loopTextVo;
            }
            boolean equals = keyword.equals(header);
            loopTextVo.setFlag(equals);
            loopTextVo.setLoopCount(loopCount + 1);
            loopTextVo.setHeader(equals ? "" : header);
            return loopTextHeadList(loopTextVo);
        } else if (!flag) {
            loopTextVo.setHeader("");
        }
        if (s.contains(keyword)) {
            loopTextVo.setValue(s.replace(keyword, ""));
            return loopTextVo;
        }
        loopTextVo.setLoopCount(loopCount + 1);
        return loopTextHeadList(loopTextVo);
    }

    /**
     * Recursively scans the fragments, starting at the state's current index, for the state's
     * keyword as a table column label.
     *
     * <p>When the (possibly multi-fragment) label is found it is added to the state's body header
     * list and the state's index points at the fragment that completed it. The scan stops at a
     * fragment starting with "以上情况" or "金额合计", or at the end of the list; in those cases
     * the state's index marks where it stopped. A "金额合计" fragment is skipped when the integer
     * quotient of the list size by its index is greater than 2, or when it is the very first
     * fragment.
     *
     * <p>The recursion advances one fragment per call, so its depth grows with the number of
     * fragments scanned.
     *
     * @param loopTextVo scan state; it is mutated and returned
     * @return the same state object
     */
    public static LoopTextVo loopTextBodyList(LoopTextVo loopTextVo) {
        int loopCount = loopTextVo.getLoopCount();
        List<String> textList = loopTextVo.getTextList();
        String header = loopTextVo.getHeader();
        String keyword = loopTextVo.getKeyword();
        List<String> bodyHeaderList = loopTextVo.getBodyHeaderList();

        if (loopCount == textList.size()) {
            return loopTextVo;
        }
        String s = textList.get(loopCount);
        if (s.startsWith("以上情况") || s.startsWith("金额合计")) {
            // loopCount == 0 is checked first so the division below can never divide by zero.
            if (s.startsWith("金额合计") && (loopCount == 0 || textList.size() / loopCount > 2)) {
                loopTextVo.setLoopCount(loopCount + 1);
                return loopTextBodyList(loopTextVo);
            }
            return loopTextVo;
        }
        if (keyword.startsWith(s) || (StringUtils.isNotBlank(header) && keyword.startsWith(header))) {
            header = header + s;
            if (keyword.equals(header)) {
                bodyHeaderList.add(header);
                return loopTextVo;
            }
            loopTextVo.setLoopCount(loopCount + 1);
            loopTextVo.setHeader(header);
            return loopTextBodyList(loopTextVo);
        } else {
            loopTextVo.setHeader("");
        }
        if (s.contains(keyword)) {
            loopTextVo.setValue(s.replace(keyword, ""));
            return loopTextVo;
        }
        loopTextVo.setLoopCount(loopCount + 1);
        return loopTextBodyList(loopTextVo);
    }
}
View Code

VO:

/**
 * Mutable scan state passed through the recursive label scans
 * {@code MyPdf.loopTextHeadList} and {@code MyPdf.loopTextBodyList}.
 */
@Data
public class LoopTextVo {

    /**
     * Creates a scan state that starts at the first fragment.
     *
     * @param textList fragments to scan
     * @param keyword  label to look for
     */
    LoopTextVo(List<String> textList, String keyword) {
        this.textList = textList;
        this.keyword = keyword;
    }

    /** Fragments being scanned. */
    private List<String> textList;

    /** Column labels found by the body scan. */
    private List<String> bodyHeaderList = new ArrayList<>();
    /** Index into {@link #textList} of the fragment the scan is currently at. */
    private int loopCount;
    /** Label the scan is looking for. */
    private String keyword = "";
    /** Part of the label accumulated so far when it is spread over several fragments. */
    private String header = "";
    /** Text collected as the value belonging to the label. */
    private String value = "";
    /** Set by the header scan once the accumulated label equals the keyword. */
    private boolean flag = false;
}
View Code

封装对象:

/**
 * Result of parsing one tax-payment certificate: the header values plus one
 * {@link TaxInvoicePdfDetail} per table row.
 */
@Data
public class TaxInvoicePdfDto {

    /** Value found after the "税务机关" label. */
    private String natTaxAuth;

    /** Value found after the "纳税人识别号" label. */
    private String taxpayerNumber;

    /** Value found after the "纳税人名称" label. */
    private String taxpayerName;

    /** Parsed table rows. */
    private List<TaxInvoicePdfDetail> detailList;

}

==== 以上、以下是两个类,实际项目中应分开写在各自的文件里,这里为了方便直接放在了一起 ====
/**
 * One row of the certificate's detail table.
 */
@Data
public class TaxInvoicePdfDetail {

    /**
     * Tax category (column "税种").
     */
    private String taxCategory;

    /**
     * Tax item name (column "品目名称").
     */
    private String taxItems;
    /**
     * Start of the period the tax belongs to.
     */
    private String legalizeBelongDateFrom;
    /**
     * End of the period the tax belongs to.
     */
    private String legalizeBelongDateTo;

    /**
     * Treasury deposit (refund) date.
     */
    private String paymentDate;
    /**
     * Amount actually paid (refunded).
     */
    private BigDecimal totalAmount;
}
View Code

 

测试类:

 /**
  * Parses every regular entry of a directory and prints the resulting DTO for each file.
  *
  * <p>The directory is taken from the first command-line argument when present and defaults
  * to {@code D:\pdf\} otherwise. A failure on one file is reported and does not stop the
  * remaining files from being processed.
  *
  * @param args optional: {@code args[0]} is the directory to scan
  */
 public static void main(String[] args) {

        String directory = args.length > 0 ? args[0] : "D:\\pdf\\";
        File[] files = new File(directory).listFiles(pathname -> !pathname.isDirectory());
        // listFiles returns null when the path is not a directory or cannot be read.
        if (files == null) {
            System.err.println("Cannot list directory: " + directory);
            return;
        }
        for (File file : files) {
            System.out.println(file.getName());
            try {
                MyPdf pdfDomTree = new MyPdf();
                List<String> textList = pdfDomTree.parsePdf(file);
                // parsePdfText is static, so it is called on the class rather than the instance.
                TaxInvoicePdfDto taxInvoicePdfDto = MyPdf.parsePdfText(textList);
                System.out.println(taxInvoicePdfDto);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }