|
@@ -28,6 +28,10 @@ import java.util.stream.Collectors;
|
|
|
public class PDFTableReader<T> {
|
|
|
// Line-break marker: a single carriage return.
|
|
|
private final static String LINE_WRAP = "\r";
|
|
|
// The four tags below each hold their own name as a string value.
// NOTE(review): presumably they label kinds of parsed PDF lines (plain text,
// table header, table data line, extra table line); their usages are outside
// this view — confirm against the callers.
+ private final static String normal = "normal";
|
|
|
+ private final static String tableHead = "tableHead";
|
|
|
+ private final static String tableLine = "tableLine";
|
|
|
+ private final static String tableExtra = "tableExtra";
|
|
|
/**
|
|
|
* PDF数据缓冲器
|
|
|
*/
|
|
@@ -331,7 +335,74 @@ public class PDFTableReader<T> {
|
|
|
}
|
|
|
|
|
|
|
|
|
- public static List<String> getMultipleLineFields(MultipartFile file, String lineStartText, String lineEndText, boolean sort) {
|
|
|
+
|
|
|
+ public static List<String> getFields(MultipartFile file, String startText, String endText, String cutText, boolean sort) {
|
|
|
+ //获取文档坐标
|
|
|
+ List<String> result = new ArrayList<>();
|
|
|
+ InputStream inputStream = null;
|
|
|
+ PDDocument document = null;
|
|
|
+ try {
|
|
|
+ inputStream = file.getInputStream();
|
|
|
+ document = PDDocument.load(inputStream);
|
|
|
+ PDFTextStripper textStripper = new PDFTextStripper() {
|
|
|
+ @Override
|
|
|
+ protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
|
|
|
+ if (text.contains(startText)) {
|
|
|
+ int startIndex = 0;
|
|
|
+ int endIndex = text.length();
|
|
|
+ if (Strings.isNotBlank(startText)) {
|
|
|
+ startIndex = text.indexOf(startText);
|
|
|
+ }
|
|
|
+ if (Strings.isNotBlank(endText)) {
|
|
|
+ endIndex = text.indexOf(endText, startIndex + startText.length());
|
|
|
+ }
|
|
|
+ if (startIndex == -1) {
|
|
|
+ startIndex = 0;
|
|
|
+ }
|
|
|
+ if (endIndex == -1) {
|
|
|
+ endIndex = text.length();
|
|
|
+ }
|
|
|
+ result.add(text.substring(startIndex + startText.length(), endIndex).trim());
|
|
|
+ }
|
|
|
+ }
|
|
|
+ };
|
|
|
+
|
|
|
+ textStripper.setSortByPosition(sort);
|
|
|
+ textStripper.setStartPage(1);
|
|
|
+ textStripper.setEndPage(document.getNumberOfPages());
|
|
|
+
|
|
|
+ textStripper.getText(document);
|
|
|
+
|
|
|
+ document.close();
|
|
|
+
|
|
|
+
|
|
|
+ } catch (IOException e) {
|
|
|
+ e.printStackTrace();
|
|
|
+ } finally {
|
|
|
+ if (document != null) {
|
|
|
+ try {
|
|
|
+ document.close();
|
|
|
+ } catch (IOException e) {
|
|
|
+ e.printStackTrace();
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if (inputStream != null) {
|
|
|
+ try {
|
|
|
+ inputStream.close();
|
|
|
+ } catch (IOException e) {
|
|
|
+ e.printStackTrace();
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+ if (result == null || result.size() == 0) {
|
|
|
+ result.add(" ");
|
|
|
+ }
|
|
|
+ return result;
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ public static List<String> getMultipleLineFields(MultipartFile file, String lineStartText, String lineEndText, String[] extraLines, boolean sort) {
|
|
|
//获取文档坐标
|
|
|
List<String> result = new ArrayList<>();
|
|
|
final boolean[] startRecord = {false};
|
|
@@ -343,6 +414,7 @@ public class PDFTableReader<T> {
|
|
|
PDFTextStripper textStripper = new PDFTextStripper() {
|
|
|
@Override
|
|
|
protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
|
|
|
+ boolean isRemove = false;
|
|
|
if (text.contains(lineStartText)) {
|
|
|
startRecord[0] = true;
|
|
|
}
|
|
@@ -353,7 +425,14 @@ public class PDFTableReader<T> {
|
|
|
}
|
|
|
}
|
|
|
if (startRecord[0]) {
|
|
|
- result.add(text.trim());
|
|
|
+ for(String line : extraLines) {
|
|
|
+ if(text.contains(line)) {
|
|
|
+ isRemove = true;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if(!isRemove) {
|
|
|
+ result.add(text.trim());
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
};
|
|
@@ -544,13 +623,8 @@ public class PDFTableReader<T> {
|
|
|
cell.setPdFont(textPositions.get(0).getFont());
|
|
|
rowCells.add(cell);
|
|
|
cellRow.setCell(rowCells);
|
|
|
-
|
|
|
-
|
|
|
}
|
|
|
-
|
|
|
}
|
|
|
-
|
|
|
-
|
|
|
} else {
|
|
|
cell.setPositions(textPositions);
|
|
|
cell.setText(String.valueOf(key).split("-@@@-")[0]);
|
|
@@ -608,16 +682,17 @@ public class PDFTableReader<T> {
|
|
|
|
|
|
List<Float> dataLines = new ArrayList<>();
|
|
|
|
|
|
- dataColumn = 0;
|
|
|
- for(Float key : limitObject.keySet()) {
|
|
|
- if(limitObject.get(key).size() > dataColumn) {
|
|
|
- dataColumn = limitObject.get(key).size();
|
|
|
- }
|
|
|
- }
|
|
|
+ dataColumn = fields.size()/2 + 1;
|
|
|
+// for(Float key : limitObject.keySet()) {
|
|
|
+// if(limitObject.get(key).size() > dataColumn) {
|
|
|
+// dataColumn = limitObject.get(key).size();
|
|
|
+// }
|
|
|
+// }
|
|
|
for (Float key : limitObject.keySet()) {
|
|
|
boolean isHeader = false;
|
|
|
//去除标题行
|
|
|
|
|
|
+ //判断是否为数据行
|
|
|
if (limitObject.get(key).size() >= dataColumn) {
|
|
|
for (PdfRow headerRow : headerRows) {
|
|
|
if (headerRow.getRowY() == key) {
|
|
@@ -702,6 +777,140 @@ public class PDFTableReader<T> {
|
|
|
return result;
|
|
|
}
|
|
|
|
|
|
+// public static PdfTable getLinesByParsePdf(MultipartFile file, List<String> fields, List<String> ignoreFields, String endKey, int tableEndIndex, Map<String, List<PdfTextPosition>> documentPositions, String headerAlignment, String standardX, String headerYText, String[] extraLines, int headerLimit) {
|
|
|
+// PdfTable pdfTable = new PdfTable();
|
|
|
+// try {
|
|
|
+// InputStream inputStream = file.getInputStream();
|
|
|
+// PDDocument document = PDDocument.load(inputStream);
|
|
|
+//
|
|
|
+// PDFTextStripper textStripper = new PDFTextStripper() {
|
|
|
+// private int pageNumber = 0;
|
|
|
+// float width = 0;
|
|
|
+// float height = 0;
|
|
|
+// float standardXPosition = 0;
|
|
|
+// float rowY = 0;
|
|
|
+//
|
|
|
+// @Override
|
|
|
+// protected void writePage() throws IOException {
|
|
|
+// pageNumber++;
|
|
|
+// PDPage page = document.getPage(pageNumber - 1);
|
|
|
+// width = page.getMediaBox().getWidth();
|
|
|
+// height = page.getMediaBox().getHeight();
|
|
|
+// pdfTable.setWidth(width);
|
|
|
+// pdfTable.setHeight(height);
|
|
|
+// super.writePage();
|
|
|
+// }
|
|
|
+//
|
|
|
+// @Override
|
|
|
+// protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
|
|
|
+// //设置左侧空白偏移量
|
|
|
+// if (text.contains(standardX)) {
|
|
|
+// pdfTable.setStandardPosition(textPositions.get(0).getX());
|
|
|
+// }
|
|
|
+//
|
|
|
+// //重置textPositions各字段高度,加上每页页码*高度
|
|
|
+// List<PdfTextPosition> positions = new ArrayList<>();
|
|
|
+// for (TextPosition textPosition : textPositions) {
|
|
|
+// PdfTextPosition position = new PdfTextPosition();
|
|
|
+// position.setX(textPosition.getX());
|
|
|
+// position.setY(textPosition.getY() + (pageNumber - 1) * height);
|
|
|
+// position.setPageNum(pageNumber);
|
|
|
+// position.setFont(textPosition.getFont());
|
|
|
+// position.setFontSize(textPosition.getFontSize());
|
|
|
+// position.setUnicode(textPosition.getUnicode());
|
|
|
+// positions.add(position);
|
|
|
+// }
|
|
|
+// //寻找表格中各字段隔绝行
|
|
|
+// if (extraLines != null) {
|
|
|
+// List<Float> extraLinesBorder = pdfTable.getExtraLinesBorder() == null ? new ArrayList<>() : pdfTable.getExtraLinesBorder();
|
|
|
+// for (String extra : extraLines) {
|
|
|
+// if (text.contains(extra)) {
|
|
|
+// extraLinesBorder.add(positions.get(0).getY());
|
|
|
+// }
|
|
|
+// }
|
|
|
+// pdfTable.setExtraLinesBorder(extraLinesBorder);
|
|
|
+// }
|
|
|
+//
|
|
|
+// documentPositions.put(text + "-@@@-" + (positions.get(0).getX() + positions.get(0).getY()), positions);
|
|
|
+// //设置行高
|
|
|
+// if (text.contains(headerYText)) {
|
|
|
+// rowY = positions.get(0).getY();
|
|
|
+// PdfRow pdfRow = headerRows.get(rowY) == null ? new PdfRow() : headerRows.get(rowY);
|
|
|
+// pdfRow.setRowY(rowY);
|
|
|
+// headerRows.put(rowY, pdfRow);
|
|
|
+// }
|
|
|
+// //获取header所在行
|
|
|
+//
|
|
|
+// PdfRow headerRow = null;
|
|
|
+// List<PdfCell> pdfCells = null;
|
|
|
+// for (String filed : fields) {
|
|
|
+// if (text.contains(filed)) {
|
|
|
+// headerRow = headerRows.get(positions.get(0).getY()) == null ? new PdfRow() : headerRows.get(positions.get(0).getY());
|
|
|
+// pdfCells = headerRow.getCell() == null ? new ArrayList<>() : headerRow.getCell();
|
|
|
+// PdfCell pdfCell = new PdfCell();
|
|
|
+// //文本识别可能出错,会带多个字符,需识别标题在此文本中的位置并重新赋起始结束x值
|
|
|
+// int firstPosition = text.indexOf(filed);
|
|
|
+// int lastPosition = firstPosition + filed.length() - 1;
|
|
|
+// pdfCell.setCellStartX(positions.get(0).getX());
|
|
|
+// pdfCell.setCellEndX(positions.get(positions.size() - 1).getX());
|
|
|
+// pdfCell.setCellY(positions.get(0).getY());
|
|
|
+// pdfCell.setPdFont(textPositions.get(0).getFont());
|
|
|
+// pdfCell.setText(filed);
|
|
|
+// pdfCell.setFontSize(textPositions.get(0).getFontSize());
|
|
|
+// pdfCell.setCellStartX(textPositions.get(firstPosition).getX());
|
|
|
+// pdfCell.setCellEndX(textPositions.get(lastPosition).getX());
|
|
|
+// pdfCells.add(pdfCell);
|
|
|
+// headerRow.setCell(pdfCells);
|
|
|
+// headerRows.put(positions.get(0).getY(), headerRow);
|
|
|
+// }
|
|
|
+// }
|
|
|
+//
|
|
|
+//
|
|
|
+// //添加忽略行
|
|
|
+// for (String ignoreField : ignoreFields) {
|
|
|
+// if (text.contains(ignoreField)) {
|
|
|
+// Map<Float, Boolean> ignoreRows = pdfTable.getIgnoreRows() == null ? new HashMap<>() : pdfTable.getIgnoreRows();
|
|
|
+// PdfRow pdfRow = new PdfRow();
|
|
|
+// pdfRow.setRowY(positions.get(0).getY());
|
|
|
+// ignoreRows.put(pdfRow.getRowY(), true);
|
|
|
+// pdfTable.setIgnoreRows(ignoreRows);
|
|
|
+// }
|
|
|
+// }
|
|
|
+//
|
|
|
+//
|
|
|
+// //提供了表格结束行数据就用,不用就按给的字段模糊匹
|
|
|
+// if (tableEndIndex == -1) {
|
|
|
+// if (Strings.isBlank(endKey)) {
|
|
|
+// pdfTable.setTableEndY(document.getNumberOfPages() * height);
|
|
|
+// } else if (text.contains(endKey)) {
|
|
|
+// pdfTable.setTableEndY(positions.get(0).getY());
|
|
|
+// }
|
|
|
+// } else {
|
|
|
+// pdfTable.setTableEndY(tableEndIndex);
|
|
|
+// }
|
|
|
+//// if (headerRow != null && headerRow.getRowY() > 0) {
|
|
|
+//// if (headerRow.getCell() != null && headerRow.getCell().size() > 0) {
|
|
|
+//// pdfTable.setTableStartY(headerRow.getCell().get(0).getCellY());
|
|
|
+//// }
|
|
|
+//// // pdfTable.setHeaderRows(rows);
|
|
|
+//// }
|
|
|
+//
|
|
|
+// }
|
|
|
+//
|
|
|
+//
|
|
|
+// };
|
|
|
+//
|
|
|
+// textStripper.setSortByPosition(true);
|
|
|
+// textStripper.setStartPage(1);
|
|
|
+// textStripper.setEndPage(document.getNumberOfPages());
|
|
|
+// textStripper.getText(document);
|
|
|
+// document.close();
|
|
|
+// inputStream.close();
|
|
|
+// } catch (Exception e) {
|
|
|
+// e.printStackTrace();
|
|
|
+// }
|
|
|
+// }
|
|
|
+
|
|
|
public static PdfTable getHeaderAndIgnoreLine(MultipartFile file, List<String> fields, List<String> ignoreFields, String endKey, int tableEndIndex, Map<String, List<PdfTextPosition>> documentPositions, String headerAlignment, String standardX, String headerYText, String[] extraLines, int headerLimit) {
|
|
|
PdfTable pdfTable = new PdfTable();
|
|
|
|