|
@@ -0,0 +1,419 @@
|
|
|
+package org.jeecg.modules.saleCode.util;
|
|
|
+
|
|
|
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import lombok.AllArgsConstructor;
import lombok.Getter;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.util.Strings;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.springframework.web.multipart.MultipartFile;
import technology.tabula.*;
import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;

import java.io.File;
import java.io.IOException;
import java.lang.reflect.Field;
import java.util.*;
|
|
|
+
|
|
|
+public class PDFTableReader<T> {
|
|
|
+
|
|
|
+ private final static String LINE_WRAP = "\r";
|
|
|
+
|
|
|
+ * PDF数据缓冲器
|
|
|
+ */
|
|
|
+ private final List<Map<String, String>> dataBuff = new ArrayList<>();
|
|
|
+ private final transient File pdfFile;
|
|
|
+
|
|
|
+ * 表格标题属性
|
|
|
+ */
|
|
|
+ private final PDFTitle[] pdfTitles;
|
|
|
+
|
|
|
+
|
|
|
+ * 标题所在行号
|
|
|
+ * 默认第一行
|
|
|
+ */
|
|
|
+ private int titleRowIndex = 0;
|
|
|
+
|
|
|
+
|
|
|
+ * 描述信息
|
|
|
+ */
|
|
|
+ private String keyInfo;
|
|
|
+
|
|
|
+ * 关键信息所在页起始下标,从0开始
|
|
|
+ */
|
|
|
+ private int keyInfoPageNumber;
|
|
|
+
|
|
|
+ * 关键信息所在行起始下标
|
|
|
+ */
|
|
|
+ private int keyInfoLineStartIndex;
|
|
|
+
|
|
|
+ * 关键信息所在行结束下标
|
|
|
+ */
|
|
|
+ private int keyInfoLineEndIndex;
|
|
|
+
|
|
|
+ private final Class<T> beanClass;
|
|
|
+
|
|
|
+ private PDFTableReader(File pdfFile, Class<T> beanClass) {
|
|
|
+ this.pdfFile = pdfFile;
|
|
|
+ this.beanClass = beanClass;
|
|
|
+ if (beanClass == null) {
|
|
|
+ throw new RuntimeException("必须指定读取表格数据的实体类");
|
|
|
+ }
|
|
|
+ Field[] fields = beanClass.getDeclaredFields();
|
|
|
+ pdfTitles = new PDFTitle[fields.length];
|
|
|
+ for (int i = 0; i < fields.length; i++) {
|
|
|
+ Field field = fields[i];
|
|
|
+ TableTitle title = field.getAnnotation(TableTitle.class);
|
|
|
+ if (title == null) {
|
|
|
+ throw new RuntimeException(field.getName() + " attribute miss annotation TableTitle");
|
|
|
+ }
|
|
|
+ pdfTitles[i] = new PDFTitle(title.cellIndex(), title.value(), title.trimSpace());
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ public static <T> PDFTableReader<T> load(File pdfFile, Class<T> beanClass) {
|
|
|
+ return new PDFTableReader<T>(pdfFile, beanClass);
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ * 设置PDF表格标题所在行号
|
|
|
+ *
|
|
|
+ * @param titleRowIndex - 表头标题所在行下标,从0开始
|
|
|
+ */
|
|
|
+ public PDFTableReader<T> titleRowIndex(int titleRowIndex) {
|
|
|
+ this.titleRowIndex = titleRowIndex;
|
|
|
+ return this;
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ * 设置PDF描述信息
|
|
|
+ *
|
|
|
+ * @param keyInfoPageNumber - 关键信息所在页起始下标,从0开始
|
|
|
+ * @param keyInfoLineStartIndex - 关键信息所在行起始下标,从0开始
|
|
|
+ * @param keyInfoLineEndIndex - 关键信息所在行结束下标,从0开始
|
|
|
+ */
|
|
|
+ public PDFTableReader<T> describe(int keyInfoPageNumber, int keyInfoLineStartIndex, int keyInfoLineEndIndex) {
|
|
|
+ this.keyInfoPageNumber = keyInfoPageNumber;
|
|
|
+ this.keyInfoLineStartIndex = keyInfoLineStartIndex;
|
|
|
+ this.keyInfoLineEndIndex = keyInfoLineEndIndex;
|
|
|
+ return this;
|
|
|
+ }
|
|
|
+
|
|
|
+ public List<Map<String, String>> readMaps() {
|
|
|
+ return this.doRead();
|
|
|
+ }
|
|
|
+
|
|
|
+ public List<T> readBeans() {
|
|
|
+ List<T> beans = new ArrayList<>();
|
|
|
+ List<Map<String, String>> maps = this.doRead();
|
|
|
+ T bean;
|
|
|
+ try {
|
|
|
+ for (Map<String, String> map : maps) {
|
|
|
+ bean = this.beanClass.newInstance();
|
|
|
+ Field[] fields = bean.getClass().getDeclaredFields();
|
|
|
+ for (Field field : fields) {
|
|
|
+ TableTitle title = field.getAnnotation(TableTitle.class);
|
|
|
+ if (title == null) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ if (!field.isAccessible()) {
|
|
|
+ field.setAccessible(true);
|
|
|
+ }
|
|
|
+ field.set(bean, map.get(title.value()));
|
|
|
+ }
|
|
|
+ beans.add(bean);
|
|
|
+ }
|
|
|
+ return beans;
|
|
|
+ } catch (Exception e) {
|
|
|
+ throw new RuntimeException(e.getMessage());
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ private List<Map<String, String>> doRead() {
|
|
|
+ try (PDDocument document = PDDocument.load(pdfFile)) {
|
|
|
+
|
|
|
+ this.doReadKeyInfo(document);
|
|
|
+
|
|
|
+ List<Map<String, String>> maps = new ArrayList<>();
|
|
|
+ SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
|
|
|
+ PageIterator pi = new ObjectExtractor(document).extract();
|
|
|
+ Map<String, String> cellMap;
|
|
|
+ int rowNum = 0;
|
|
|
+ while (pi.hasNext()) {
|
|
|
+ Page page = pi.next();
|
|
|
+ List<Table> tables = sea.extract(page);
|
|
|
+ for (Table table : tables) {
|
|
|
+ List<List<RectangularTextContainer>> rows = table.getRows();
|
|
|
+ if (rows.size() <= titleRowIndex) {
|
|
|
+ throw new RuntimeException("标题行不正确");
|
|
|
+ }
|
|
|
+ for (List<RectangularTextContainer> row : rows) {
|
|
|
+ if (rowNum <= titleRowIndex) {
|
|
|
+ rowNum++;
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ cellMap = new HashMap<>(row.size());
|
|
|
+ for (int k = 0; k < row.size(); k++) {
|
|
|
+ RectangularTextContainer cell = row.get(k);
|
|
|
+ PDFTitle pdfTitle = pdfTitles[k];
|
|
|
+
|
|
|
+ if (pdfTitle.getCellIndex() != k) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ String cellText = cell.getText();
|
|
|
+ cellText = cellText == null ? "" : cellText.trim();
|
|
|
+ if (pdfTitle.isTrimSpace()) {
|
|
|
+ cellMap.put(pdfTitle.getName(), cellText.replace(LINE_WRAP, ""));
|
|
|
+ } else {
|
|
|
+ cellMap.put(pdfTitle.getName(), cellText.replace(LINE_WRAP, " "));
|
|
|
+ }
|
|
|
+ }
|
|
|
+ maps.add(cellMap);
|
|
|
+ rowNum++;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ this.dataBuff.addAll(maps);
|
|
|
+ return maps;
|
|
|
+ } catch (IOException e) {
|
|
|
+ throw new RuntimeException(e.getMessage());
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ private void doReadKeyInfo(PDDocument document) throws IOException {
|
|
|
+ PDFTextStripper stripper = new PDFTextStripper();
|
|
|
+ stripper.setSortByPosition(true);
|
|
|
+ stripper.setStartPage(keyInfoPageNumber);
|
|
|
+ stripper.setEndPage(keyInfoPageNumber + 1);
|
|
|
+ String result = stripper.getText(document);
|
|
|
+ result = result.replaceAll("\n", "");
|
|
|
+ String[] split = result.split(LINE_WRAP);
|
|
|
+ if (keyInfoLineStartIndex >= split.length || keyInfoLineEndIndex >= split.length) {
|
|
|
+ throw new RuntimeException("关键信息所在列已超出读取页码行的范围");
|
|
|
+ }
|
|
|
+ StringBuilder r = new StringBuilder();
|
|
|
+ for (int i = keyInfoLineStartIndex; i <= keyInfoLineEndIndex; i++) {
|
|
|
+ String line = split[i];
|
|
|
+ if (line == null || line.trim().equals("")) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ r.append(line.trim());
|
|
|
+ }
|
|
|
+ this.keyInfo = r.toString();
|
|
|
+ }
|
|
|
+
|
|
|
+ public List<Map<String, String>> getDataBuff() {
|
|
|
+ return dataBuff;
|
|
|
+ }
|
|
|
+
|
|
|
+ public String getKeyInfo() {
|
|
|
+ return keyInfo;
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ @Getter
|
|
|
+ @AllArgsConstructor
|
|
|
+ static class PDFTitle {
|
|
|
+
|
|
|
+ private int cellIndex;
|
|
|
+
|
|
|
+ private String name;
|
|
|
+
|
|
|
+ private boolean trimSpace;
|
|
|
+ }
|
|
|
+
|
|
|
+ public static Map<String, Float> getPoint(MultipartFile file, String sourceTex) {
|
|
|
+ Map<String, Float> points = new HashMap<>();
|
|
|
+
|
|
|
+ try {
|
|
|
+ PDDocument document = PDDocument.load(file.getInputStream());
|
|
|
+ PDFTextStripper textStripper = new PDFTextStripper() {
|
|
|
+ @Override
|
|
|
+ protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
|
|
|
+ if (text.contains(sourceTex)) {
|
|
|
+ TextPosition textPositionStart = textPositions.get(0);
|
|
|
+ TextPosition textPositionEnd = textPositions.get(textPositions.size()-1);
|
|
|
+ points.put("x", textPositionStart.getX());
|
|
|
+ points.put("y", textPositionStart.getX());
|
|
|
+ }
|
|
|
+ }
|
|
|
+ };
|
|
|
+
|
|
|
+ textStripper.setSortByPosition(true);
|
|
|
+ textStripper.setStartPage(1);
|
|
|
+ textStripper.setEndPage(document.getNumberOfPages());
|
|
|
+
|
|
|
+ textStripper.getText(document);
|
|
|
+
|
|
|
+ document.close();
|
|
|
+
|
|
|
+ } catch (IOException e) {
|
|
|
+ e.printStackTrace();
|
|
|
+ }
|
|
|
+ return points;
|
|
|
+ }
|
|
|
+
|
|
|
+ public static List<String> getFields(MultipartFile file, String startText, String endText, boolean sort) {
|
|
|
+
|
|
|
+ List<String> result = new ArrayList<>();
|
|
|
+ try {
|
|
|
+ PDDocument document = PDDocument.load(file.getInputStream());
|
|
|
+ PDFTextStripper textStripper = new PDFTextStripper() {
|
|
|
+ @Override
|
|
|
+ protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
|
|
|
+ if(text.contains(startText)) {
|
|
|
+ int startIndex = 0;
|
|
|
+ int endIndex = text.length();
|
|
|
+ if(Strings.isNotBlank(startText)) {
|
|
|
+ startIndex = text.indexOf(startText);
|
|
|
+ }
|
|
|
+ if(Strings.isNotBlank(endText)) {
|
|
|
+ endIndex = text.indexOf(endText, startIndex + startText.length());
|
|
|
+ }
|
|
|
+ if(startIndex == -1) {
|
|
|
+ startIndex = 0;
|
|
|
+ }
|
|
|
+ if(endIndex == -1) {
|
|
|
+ endIndex = text.length();
|
|
|
+ }
|
|
|
+ result.add(text.substring(startIndex + startText.length(), endIndex));
|
|
|
+ }
|
|
|
+ }
|
|
|
+ };
|
|
|
+
|
|
|
+ textStripper.setSortByPosition(sort);
|
|
|
+ textStripper.setStartPage(1);
|
|
|
+ textStripper.setEndPage(document.getNumberOfPages());
|
|
|
+
|
|
|
+ textStripper.getText(document);
|
|
|
+
|
|
|
+ document.close();
|
|
|
+
|
|
|
+ } catch (IOException e) {
|
|
|
+ e.printStackTrace();
|
|
|
+ }
|
|
|
+ if(result == null && result.size() == 0) {
|
|
|
+ result.add(" ");
|
|
|
+ }
|
|
|
+ return result;
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ public static List<String> getMultipleLineFields(MultipartFile file, String lineStartText, String lineEndText, boolean sort) {
|
|
|
+
|
|
|
+ List<String> result = new ArrayList<>();
|
|
|
+ final boolean[] startRecord = {false};
|
|
|
+ try {
|
|
|
+ PDDocument document = PDDocument.load(file.getInputStream());
|
|
|
+ PDFTextStripper textStripper = new PDFTextStripper() {
|
|
|
+ @Override
|
|
|
+ protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
|
|
|
+ if(text.contains(lineStartText)) {
|
|
|
+ startRecord[0] = true;
|
|
|
+ }
|
|
|
+ if(Strings.isNotEmpty(lineEndText)) {
|
|
|
+ if(text.contains(lineEndText)) {
|
|
|
+ startRecord[0] = false;
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if(startRecord[0]) {
|
|
|
+ result.add(text);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ };
|
|
|
+
|
|
|
+ textStripper.setSortByPosition(sort);
|
|
|
+ textStripper.setStartPage(1);
|
|
|
+ textStripper.setEndPage(document.getNumberOfPages());
|
|
|
+
|
|
|
+ textStripper.getText(document);
|
|
|
+
|
|
|
+ document.close();
|
|
|
+
|
|
|
+ } catch (IOException e) {
|
|
|
+ e.printStackTrace();
|
|
|
+ }
|
|
|
+ if(result == null && result.size() == 0) {
|
|
|
+ result.add(" ");
|
|
|
+ }
|
|
|
+ return result;
|
|
|
+ }
|
|
|
+
|
|
|
+ public static List<String> getNextLineFields(MultipartFile file, String lineStartText) {
|
|
|
+
|
|
|
+ List<String> result = new ArrayList<>();
|
|
|
+ final boolean[] startRecord = {false};
|
|
|
+ try {
|
|
|
+ PDDocument document = PDDocument.load(file.getInputStream());
|
|
|
+ PDFTextStripper textStripper = new PDFTextStripper() {
|
|
|
+ @Override
|
|
|
+ protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
|
|
|
+ if(text.contains(lineStartText)) {
|
|
|
+ startRecord[0] = true;
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ if(startRecord[0]) {
|
|
|
+ result.add(text);
|
|
|
+ startRecord[0] = false;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ };
|
|
|
+
|
|
|
+ textStripper.setSortByPosition(true);
|
|
|
+ textStripper.setStartPage(1);
|
|
|
+ textStripper.setEndPage(document.getNumberOfPages());
|
|
|
+
|
|
|
+ textStripper.getText(document);
|
|
|
+
|
|
|
+ document.close();
|
|
|
+
|
|
|
+ } catch (IOException e) {
|
|
|
+ e.printStackTrace();
|
|
|
+ }
|
|
|
+ if(result == null && result.size() == 0) {
|
|
|
+ result.add(" ");
|
|
|
+ }
|
|
|
+ return result;
|
|
|
+ }
|
|
|
+
|
|
|
+ public static JSONArray getTable(MultipartFile file, List<String> header) {
|
|
|
+
|
|
|
+
|
|
|
+ JSONArray jsonArray = new JSONArray();
|
|
|
+ SpreadsheetExtractionAlgorithm algorithm = new SpreadsheetExtractionAlgorithm();
|
|
|
+
|
|
|
+ try (PDDocument document = PDDocument.load(file.getInputStream())) {
|
|
|
+ ObjectExtractor extractor = new ObjectExtractor(document);
|
|
|
+ PageIterator pi = extractor.extract();
|
|
|
+
|
|
|
+ while (pi.hasNext()) {
|
|
|
+ Page page = pi.next();
|
|
|
+ List<Table> tableList = algorithm.extract(page);
|
|
|
+
|
|
|
+ for (Table table : tableList) {
|
|
|
+ List<List<RectangularTextContainer>> rowList = table.getRows();
|
|
|
+
|
|
|
+ for (List<RectangularTextContainer> row : rowList) {
|
|
|
+ JSONObject jsonObject = new JSONObject();
|
|
|
+
|
|
|
+ for (int i = 0; i < row.size(); i++) {
|
|
|
+ RectangularTextContainer cell = row.get(i);
|
|
|
+ String text = cell.getText().replace("\r", "");
|
|
|
+ jsonObject.put(header.get(i), text);
|
|
|
+ }
|
|
|
+ jsonArray.add(jsonObject);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } catch (Exception e) {
|
|
|
+ e.printStackTrace();
|
|
|
+ }
|
|
|
+ return jsonArray;
|
|
|
+ }
|
|
|
+}
|