Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions src/main/java/com/mindee/image/ExtractedImage.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package com.mindee.image;

import com.mindee.MindeeException;
import com.mindee.input.LocalInputSource;
import java.awt.image.BufferedImage;
import java.io.ByteArrayOutputStream;
Expand Down Expand Up @@ -62,9 +63,13 @@ public void writeToFile(String outputPath) throws IOException {
*/
public void writeToFile(Path outputPath) throws IOException {
if (!Files.isDirectory(outputPath)) {
throw new IllegalArgumentException("Provided path is not a directory.");
throw new MindeeException("Provided path is not a directory.");
}
try {
ImageIO.write(this.image, this.saveFormat, outputPath.resolve(this.filename).toFile());
} catch (IOException e) {
throw new MindeeException("Could not save file " + this.filename + ".", e);
}
ImageIO.write(this.image, this.saveFormat, outputPath.resolve(this.filename).toFile());
}

/**
Expand Down
20 changes: 13 additions & 7 deletions src/main/java/com/mindee/pdf/BasePDFExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -55,15 +55,19 @@ public BasePDFExtractor(LocalInputSource source) throws IOException {
}
}

public ExtractedPDF extractSinglePage(
List<Integer> pageNumbers,
public ExtractedPDF extractSingleDocument(
List<Integer> pageIndexes,
boolean closeOriginal
) throws IOException {
if (pageNumbers.isEmpty()) {
if (pageIndexes.isEmpty()) {
throw new MindeeException("Empty indexes not allowed for extraction.");
}
var pdfBytes = createPdfFromExistingPdf(this.sourcePdf, pageNumbers, closeOriginal);
return new ExtractedPDF(pdfBytes, makeFilename(pageNumbers));
var pdfBytes = createPdfFromExistingPdf(this.sourcePdf, pageIndexes, closeOriginal);
return new ExtractedPDF(
pdfBytes,
makeFilename(pageIndexes),
pageIndexes.stream().mapToInt(Integer::intValue).toArray()
);
}

/**
Expand All @@ -73,11 +77,13 @@ public ExtractedPDF extractSinglePage(
* @return A list of extracted files.
* @throws IOException Throws if the file can't be accessed.
*/
public ExtractedPDFs extractSubDocuments(List<List<Integer>> pageIndexes) throws IOException {
public ExtractedPDFs extractMultipleDocuments(
List<List<Integer>> pageIndexes
) throws IOException {
var extractedPDFs = new ExtractedPDFs();

for (List<Integer> pageIndexElement : pageIndexes) {
extractedPDFs.add(extractSinglePage(pageIndexElement, false));
extractedPDFs.add(extractSingleDocument(pageIndexElement, false));
}
return extractedPDFs;
}
Expand Down
38 changes: 28 additions & 10 deletions src/main/java/com/mindee/pdf/ExtractedPDF.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package com.mindee.pdf;

import com.mindee.MindeeException;
import com.mindee.input.LocalInputSource;
import java.io.IOException;
import java.nio.file.Files;
Expand All @@ -12,51 +13,68 @@
*/
@Getter
public class ExtractedPDF {
/**
* PDF content as bytes.
*/
private final byte[] fileBytes;
/**
* Name of the file when writing to disk.
*/
private final String filename;
/**
* 0-based indexes of all pages taken from the original PDF.
*/
private final int[] pageIndexes;
/**
* The number of pages in this PDF file.
*/
private final int pageCount;

/**
* Default constructor.
*
* @param fileBytes PDF file as bytes.
* @param filename Name of the extracted file.
* @param pageIndexes 0-based indexes of all pages taken from the original PDF.
*/
public ExtractedPDF(byte[] fileBytes, String filename) {
public ExtractedPDF(byte[] fileBytes, String filename, int[] pageIndexes) {
this.fileBytes = fileBytes;
this.filename = filename;
this.pageIndexes = pageIndexes;
this.pageCount = pageIndexes.length;
}

/**
* Write the extracted PDF to a file.
*
* @param outputPath the directory to write into; it must be an existing directory, the file is saved there under this PDF's filename.
* @throws IOException Throws if the file can't be accessed.
*/
public void writeToFile(Path outputPath) throws IOException {
public void writeToFile(Path outputPath) throws MindeeException {
if (!Files.isDirectory(outputPath)) {
throw new IllegalArgumentException("Provided path is not a directory.");
throw new MindeeException("Provided path is not a directory.");
}
try {
Files.write(outputPath.resolve(this.filename), this.fileBytes);
} catch (IOException e) {
throw new MindeeException("Could not save file " + this.filename + ".", e);
}

Files.write(outputPath.resolve(this.filename), this.fileBytes);
}

/**
* Write the extracted PDF to a file.
*
* @param outputPath path of the directory to write into; it must be an existing directory, the file is saved there under this PDF's filename.
* @throws IOException Throws if the file can't be accessed.
*/
public void writeToFile(String outputPath) throws IOException {
public void writeToFile(String outputPath) throws MindeeException {
writeToFile(Paths.get(outputPath));
}

/**
* Return the file in a format suitable for sending to MindeeClient for parsing.
*
* @return an instance of {@link LocalInputSource}
* @throws IOException Throws if the file can't be accessed.
*/
public LocalInputSource asInputSource() throws IOException {
public LocalInputSource asInputSource() {
return new LocalInputSource(this.fileBytes, this.filename);
}
}
4 changes: 2 additions & 2 deletions src/main/java/com/mindee/v1/fileoperations/PDFExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ public List<ExtractedPDF> extractInvoices(
.map(InvoiceSplitterV1InvoicePageGroup::getPageIndexes)
.collect(Collectors.toList());

return extractSubDocuments(indexes);
return extractMultipleDocuments(indexes);
}

/**
Expand Down Expand Up @@ -81,7 +81,7 @@ public List<ExtractedPDF> extractInvoices(
}
previousConfidence = confidence;
}
return extractSubDocuments(correctPageIndexes);
return extractMultipleDocuments(correctPageIndexes);
}

}
4 changes: 2 additions & 2 deletions src/main/java/com/mindee/v2/fileoperations/Split.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@ public Split(LocalInputSource inputSource) throws IOException {
}

public ExtractedPDF extractSingleSplit(SplitRange splitRange) throws IOException {
return this.pdfSplitter.extractSinglePage(splitRange.getPageRangeDistinct(), true);
return this.pdfSplitter.extractSingleDocument(splitRange.getPageRangeDistinct(), true);
}

public ExtractedPDFs extractMultipleSplits(ArrayList<SplitRange> splitRanges) throws IOException {
return this.pdfSplitter
.extractSubDocuments(
.extractMultipleDocuments(
splitRanges.stream().map(SplitRange::getPageRangeDistinct).collect(Collectors.toList())
);
}
Expand Down
12 changes: 6 additions & 6 deletions src/test/java/com/mindee/v2/fileoperations/SplitTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import static com.mindee.TestingUtilities.deleteRecursively;
import static com.mindee.TestingUtilities.getResourcePath;
import static com.mindee.TestingUtilities.getV2ResourcePath;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;

import com.mindee.input.LocalInputSource;
Expand Down Expand Up @@ -34,8 +35,7 @@ void singlePage_splitsCorrectly() throws IOException {
.extractSingleSplit(doc.getInference().getResult().getSplits().get(0));

assertEquals("default_sample_pages-001-001.pdf", extractedSplit.getFilename());
var asInputSource = extractedSplit.asInputSource();
assertEquals(1, asInputSource.getPageCount());
assertEquals(1, extractedSplit.getPageCount());

extractedSplit.writeToFile(outputPath);
}
Expand All @@ -54,13 +54,13 @@ void multiplePages_splitsCorrectly() throws IOException {

var split0 = extractedSplits.get(0);
assertEquals("default_sample_pages-001-001.pdf", split0.getFilename());
var asInputSource0 = split0.asInputSource();
assertEquals(1, asInputSource0.getPageCount());
assertEquals(1, split0.getPageCount());
assertArrayEquals(new int[] { 0 }, split0.getPageIndexes());

var split1 = extractedSplits.get(1);
assertEquals("default_sample_pages-002-002.pdf", split1.getFilename());
var asInputSource1 = split1.asInputSource();
assertEquals(1, asInputSource1.getPageCount());
assertEquals(1, split0.getPageCount());
assertArrayEquals(new int[] { 1 }, split1.getPageIndexes());

extractedSplits.saveAllToDisk(outputPath);
}
Expand Down
Loading