|
@@ -2,6 +2,7 @@ package org.jeecg.modules.saleCode.util;
|
|
|
|
|
|
import com.alibaba.fastjson.JSONArray;
|
|
|
import com.alibaba.fastjson.JSONObject;
|
|
|
+import com.sun.org.apache.xpath.internal.operations.Bool;
|
|
|
import lombok.AllArgsConstructor;
|
|
|
import lombok.Getter;
|
|
|
import org.apache.logging.log4j.util.Strings;
|
|
@@ -89,6 +90,224 @@ public class PDFTableReader<T> {
|
|
|
return new PDFTableReader<T>(pdfFile, beanClass);
|
|
|
}
|
|
|
|
|
|
+ public static PdfTable initTable(MultipartFile file, String standardX) {
|
|
|
+ PdfTable pdfTable = new PdfTable();
|
|
|
+ Map<Float, PdfRow> pdfRowMap = new HashMap<>();
|
|
|
+ try {
|
|
|
+ InputStream inputStream = file.getInputStream();
|
|
|
+ PDDocument document = PDDocument.load(inputStream);
|
|
|
+ Map<Float, PdfRow> headerRows = new HashMap<>();
|
|
|
+ PDFTextStripper textStripper = new PDFTextStripper() {
|
|
|
+ private int pageNumber = 0;
|
|
|
+ float width = 0;
|
|
|
+ float height = 0;
|
|
|
+ float standardXPosition = 0;
|
|
|
+ float rowY = 0;
|
|
|
+
|
|
|
+ @Override
|
|
|
+ protected void writePage() throws IOException {
|
|
|
+ pageNumber++;
|
|
|
+ PDPage page = document.getPage(pageNumber - 1);
|
|
|
+ width = page.getMediaBox().getWidth();
|
|
|
+ height = page.getMediaBox().getHeight();
|
|
|
+ pdfTable.setWidth(width);
|
|
|
+ pdfTable.setHeight(height);
|
|
|
+ super.writePage();
|
|
|
+ }
|
|
|
+
|
|
|
+ @Override
|
|
|
+ protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
|
|
|
+ //设置左侧空白偏移量
|
|
|
+ if (text.contains(standardX)) {
|
|
|
+ pdfTable.setStandardPosition(textPositions.get(0).getX());
|
|
|
+ }
|
|
|
+
|
|
|
+ //初始化每个字段的行高,宽度,对于间隔超过3个字符宽度的字段自动拆分重组为新字段(x-前一个字段的endx值大于3的,默认为新字段)
|
|
|
+ //上下高度小于3的可视为一行
|
|
|
+ List<PdfTextPosition> cellPosition = new ArrayList<>();
|
|
|
+ Float preCharEndX = null;
|
|
|
+ PdfRow pdfRow;
|
|
|
+ StringBuffer textBuffer = new StringBuffer();
|
|
|
+ for(int i = 0; i < textPositions.size(); i ++) {
|
|
|
+ PdfTextPosition position = new PdfTextPosition();
|
|
|
+ TextPosition textPosition = textPositions.get(i);
|
|
|
+ position.setX(textPosition.getX());
|
|
|
+ position.setEndX(textPosition.getEndX());
|
|
|
+ position.setY(textPosition.getY() + (pageNumber - 1) * height);
|
|
|
+ position.setPageNum(pageNumber);
|
|
|
+ position.setFont(textPosition.getFont());
|
|
|
+ position.setFontSize(textPosition.getFontSize());
|
|
|
+ position.setUnicode(textPosition.getUnicode());
|
|
|
+ if(preCharEndX != null) {
|
|
|
+ //超过3个字符宽度的字段自动拆分重组为新字段
|
|
|
+ if(position.getX() - preCharEndX > 3) {
|
|
|
+ PdfCell pdfCell = new PdfCell();
|
|
|
+ pdfCell.setPositions(cellPosition);
|
|
|
+ pdfCell.setCellY(cellPosition.get(0).getY());
|
|
|
+ pdfCell.setText(String.valueOf(textBuffer));
|
|
|
+ pdfCell.setCellStartX(cellPosition.get(0).getX());
|
|
|
+ pdfCell.setCellEndX(cellPosition.get(cellPosition.size() - 1).getEndX());
|
|
|
+ pdfRow = pdfRowMap.get(pdfCell.getCellY()) == null ? new PdfRow() : pdfRowMap.get(pdfCell.getCellY());
|
|
|
+ List<PdfCell> cell = pdfRow.getCell() == null ? new ArrayList<>() : pdfRow.getCell();
|
|
|
+ cell.add(pdfCell);
|
|
|
+ pdfRow.setCell(cell);
|
|
|
+ pdfRowMap.put(pdfCell.getCellY(), pdfRow);
|
|
|
+
|
|
|
+ textBuffer = new StringBuffer();
|
|
|
+ cellPosition = new ArrayList<>();
|
|
|
+ }
|
|
|
+ }
|
|
|
+ textBuffer.append(textPosition.getUnicode());
|
|
|
+ cellPosition.add(position);
|
|
|
+ preCharEndX = position.getEndX();
|
|
|
+ if(i == textPositions.size() - 1) {
|
|
|
+ PdfCell pdfCell = new PdfCell();
|
|
|
+ pdfCell.setPositions(cellPosition);
|
|
|
+ pdfCell.setCellY(cellPosition.get(0).getY());
|
|
|
+ pdfCell.setText(String.valueOf(textBuffer));
|
|
|
+ pdfCell.setCellStartX(cellPosition.get(0).getX());
|
|
|
+ pdfCell.setCellEndX(cellPosition.get(cellPosition.size() - 1).getEndX());
|
|
|
+ pdfRow = pdfRowMap.get(pdfCell.getCellY()) == null ? new PdfRow() : pdfRowMap.get(pdfCell.getCellY());
|
|
|
+ List<PdfCell> cell = pdfRow.getCell() == null ? new ArrayList<>() : pdfRow.getCell();
|
|
|
+ cell.add(pdfCell);
|
|
|
+ pdfRow.setCell(cell);
|
|
|
+ pdfRowMap.put(pdfCell.getCellY(), pdfRow);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ };
|
|
|
+
|
|
|
+ textStripper.setSortByPosition(true);
|
|
|
+ textStripper.setStartPage(1);
|
|
|
+ textStripper.setEndPage(document.getNumberOfPages());
|
|
|
+ textStripper.getText(document);
|
|
|
+ document.close();
|
|
|
+ inputStream.close();
|
|
|
+ pdfTable.setPdfRows(pdfRowMap);
|
|
|
+ } catch (IOException e) {
|
|
|
+ e.printStackTrace();
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ return pdfTable;
|
|
|
+
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ public static PdfTable handlePdf(MultipartFile file, String standardX) {
|
|
|
+ PdfTable pdfTable = initTable(file, standardX);
|
|
|
+ //行高度重新校准,对于行高度差距小于2的,自动将下一行合并至上一行
|
|
|
+ adjustY(pdfTable);
|
|
|
+ return pdfTable;
|
|
|
+
|
|
|
+ }
|
|
|
+
|
|
|
+ public static String getLineByTitle(MultipartFile file, String standardX, String title, String afterTitle, String nextTitle) {
|
|
|
+ PdfTable pdfTable = handlePdf(file, standardX);
|
|
|
+ Map<Float, PdfRow> pdfRows = pdfTable.getPdfRows();
|
|
|
+ Float titleRow = Float.NaN;
|
|
|
+ Float nextRow = null;
|
|
|
+ Float titleStartX = null;
|
|
|
+ Float titleEndX = null;
|
|
|
+ Float afterTitleX = null;
|
|
|
+ for(Float key : pdfRows.keySet()) {
|
|
|
+ //确定title所在行
|
|
|
+ PdfRow pdfRow = pdfRows.get(key);
|
|
|
+ if(pdfRow == null) continue;
|
|
|
+ List<PdfCell> pdfCells = pdfRow.getCell();
|
|
|
+ for(PdfCell pdfCell : pdfCells) {
|
|
|
+ if(pdfCell.getText().contains(title)) {
|
|
|
+ titleRow = key;
|
|
|
+ titleStartX = pdfCell.getCellStartX();
|
|
|
+ }
|
|
|
+ //如果是多行,则获取截取行所在行高度
|
|
|
+ if(nextTitle != null) {
|
|
|
+ if(pdfCell.getText().contains(nextTitle)) {
|
|
|
+ nextRow = key;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //如果需要获取的内容后有其他干扰title,则需要加上aftertitle辨识,只获取aftertitle之前的内容
|
|
|
+ //获取aftertitle的起始X
|
|
|
+ if(afterTitle != null) {
|
|
|
+ if(pdfCell.getText().contains(afterTitle)) {
|
|
|
+ afterTitleX = pdfCell.getCellStartX();
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ //根据titleRow和nextRow获取多行数据
|
|
|
+ List<Float> collect = new ArrayList<>();
|
|
|
+ if(nextRow != null) {
|
|
|
+ Float finalTitleRow = titleRow;
|
|
|
+ Float finalNextRow = nextRow;
|
|
|
+ collect = pdfRows.keySet().stream().filter(item -> {
|
|
|
+ return item >= finalTitleRow && item < finalNextRow;
|
|
|
+ }).sorted().collect(Collectors.toList());
|
|
|
+ }else {
|
|
|
+ collect.add(titleRow);
|
|
|
+ }
|
|
|
+
|
|
|
+ StringBuffer resultBuffer = new StringBuffer();
|
|
|
+ for(Float row : collect) {
|
|
|
+ PdfRow pdfRow = pdfRows.get(row);
|
|
|
+ if(pdfRow == null) continue;
|
|
|
+ List<PdfCell> cells = pdfRow.getCell();
|
|
|
+ for(PdfCell cell : cells) {
|
|
|
+ //如果存在右侧隔绝列,则将每行小于此隔绝列值的数据按顺序添加,否则就添加此行所有在选定标题起始X之后的数据
|
|
|
+ if(afterTitleX == null) {
|
|
|
+ //设置绝对大值,使每个单元格都能被包裹
|
|
|
+ afterTitleX = Float.valueOf("5000");
|
|
|
+ }
|
|
|
+ if(cell.getCellStartX() >= titleStartX && cell.getCellEndX() <= afterTitleX) {
|
|
|
+ resultBuffer.append(cell.getText());
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return String.valueOf(resultBuffer).replaceAll(title, "").trim();
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+ //确定内容行和最低限制行后,如果需要获取的内容后有其他干扰title,则需要加上aftertitle辨识,只获取aftertitle之前的内容
|
|
|
+ }
|
|
|
+
|
|
|
+ private static void adjustY(PdfTable pdfTable) {
|
|
|
+ Map<Float, PdfRow> pdfRows = pdfTable.getPdfRows();
|
|
|
+ Set<Float> keys = pdfRows.keySet();
|
|
|
+ List<Float> sortKeys = keys.stream().sorted().collect(Collectors.toList());
|
|
|
+ Float preY = null;
|
|
|
+ Boolean isAdjust = false;
|
|
|
+ for(Float key : sortKeys) {
|
|
|
+ isAdjust = false;
|
|
|
+ if(preY == null) {
|
|
|
+ preY = key;
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ if(key - preY > 0 && key - preY < 2) {
|
|
|
+ List<PdfCell> cell1 = pdfRows.get(preY).getCell();
|
|
|
+ List<PdfCell> cell2 = pdfRows.get(key).getCell();
|
|
|
+ cell1.addAll(cell2);
|
|
|
+ pdfRows.put(key, null);
|
|
|
+ cell1.sort(new Comparator<PdfCell>() {
|
|
|
+ @Override
|
|
|
+ public int compare(PdfCell o1, PdfCell o2) {
|
|
|
+ return (int) (o1.getCellStartX() - o2.getCellStartX());
|
|
|
+ }
|
|
|
+ });
|
|
|
+ PdfRow pdfRow = pdfRows.get(preY);
|
|
|
+ pdfRow.setCell(cell1);
|
|
|
+ isAdjust = true;
|
|
|
+ }
|
|
|
+ if(!isAdjust) {
|
|
|
+ preY = key;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ pdfTable.setPdfRows(pdfRows);
|
|
|
+ }
|
|
|
+
|
|
|
/**
|
|
|
* 设置PDF表格标题所在行号
|
|
|
*
|