Skip to content

Commit

Permalink
Merge pull request #2 from BobLd/pdfpig-0.1.4
Browse files Browse the repository at this point in the history
Pdfpig 0.1.4
  • Loading branch information
BobLd authored Nov 29, 2020
2 parents f5b49f0 + eecf84f commit e7c0dfe
Show file tree
Hide file tree
Showing 11 changed files with 61 additions and 41 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,15 @@
using UglyToad.PdfPig.Content;
using UglyToad.PdfPig.Core;
using UglyToad.PdfPig.Graphics.Colors;
using UglyToad.PdfPig.Rendering;
using static UglyToad.PdfPig.Core.PdfSubpath;

namespace Camelot.ImageProcessing.OpenCvSharp4
{
/// <summary>
/// Only draws pdf paths and images - letters are ignored.
/// </summary>
public class BasicSystemDrawingProcessor : IDrawingProcessor
public class BasicSystemDrawingProcessor : IPageImageRenderer
{
private static Matrix GetInitialMatrix(int rotation, CropBox mediaBox)
{
Expand Down Expand Up @@ -51,9 +52,16 @@ private static Matrix GetInitialMatrix(int rotation, CropBox mediaBox)
dx, dy);
}

public MemoryStream DrawPage(Page page, double pageScale)
/// <summary>
/// <inheritdoc/>
/// </summary>
/// <param name="page"></param>
/// <param name="pageScale"></param>
/// <param name="imageFormat"></param>
/// <returns></returns>
public byte[] Render(Page page, double pageScale, PdfRendererImageFormat imageFormat = PdfRendererImageFormat.Png)
{
var ms = new MemoryStream();
using (var ms = new MemoryStream())
using (var bitmap = new Bitmap((int)Math.Ceiling(page.Width * pageScale), (int)Math.Ceiling(page.Height * pageScale), PixelFormat.Format32bppRgb))
using (var currentGraphics = Graphics.FromImage(bitmap))
{
Expand Down Expand Up @@ -141,9 +149,9 @@ public MemoryStream DrawPage(Page page, double pageScale)
}
}

bitmap.Save(ms, ImageFormat.Png);
bitmap.Save(ms, ToSystemImageFormat(imageFormat));
return ms.ToArray();
}
return ms;
}

private void DrawImage(IPdfImage image, Graphics graphics)
Expand Down Expand Up @@ -194,6 +202,28 @@ private void DrawImage(IPdfImage image, Graphics graphics)
}
}

/// <summary>
/// Maps a <see cref="PdfRendererImageFormat"/> value to the matching <see cref="ImageFormat"/>.
/// </summary>
/// <param name="pdfRendererImageFormat">The requested renderer image format.</param>
/// <returns>
/// The <see cref="ImageFormat"/> for Bmp, Gif, Jpeg and Tiff;
/// <see cref="ImageFormat.Png"/> for Png and for any other value.
/// </returns>
private static ImageFormat ToSystemImageFormat(PdfRendererImageFormat pdfRendererImageFormat)
{
    if (pdfRendererImageFormat == PdfRendererImageFormat.Bmp)
    {
        return ImageFormat.Bmp;
    }

    if (pdfRendererImageFormat == PdfRendererImageFormat.Gif)
    {
        return ImageFormat.Gif;
    }

    if (pdfRendererImageFormat == PdfRendererImageFormat.Jpeg)
    {
        return ImageFormat.Jpeg;
    }

    if (pdfRendererImageFormat == PdfRendererImageFormat.Tiff)
    {
        return ImageFormat.Tiff;
    }

    // Png, and anything not matched above, is written as Png.
    return ImageFormat.Png;
}

/// <summary>
/// Default to Black.
/// </summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@

<ItemGroup>
<PackageReference Include="OpenCvSharp4.runtime.win" Version="4.5.0.20201013" />
<PackageReference Include="PdfPig" Version="0.1.4" />
</ItemGroup>

<ItemGroup>
Expand Down
10 changes: 4 additions & 6 deletions Camelot.ImageProcessing.OpenCvSharp4/OpenCvImageProcesser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using System.Diagnostics;
using System.Linq;
using UglyToad.PdfPig.Content;
using UglyToad.PdfPig.Rendering;

namespace Camelot.ImageProcessing
{
Expand All @@ -25,7 +26,7 @@ public class OpenCvImageProcesser : IImageProcesser
/// Process the page to extract the tables.
/// </summary>
/// <param name="page"></param>
/// <param name="drawingProcessor"></param>
/// <param name="imageRenderer"></param>
/// <param name="process_background">Whether or not to process lines that are in background.</param>
/// <param name="blocksize">Size of a pixel neighborhood that is used to calculate a threshold value for the pixel: 3, 5, 7, and so on.
/// <para>For more information, refer to `OpenCV's adaptiveThreshold https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold`.</para></param>
Expand All @@ -42,7 +43,7 @@ public class OpenCvImageProcesser : IImageProcesser
/// <para>vertical_segments - vertical lines (in PDF coordinates)</para>
/// horizontal_segments - horizontal lines (in PDF coordinates)</returns>
public (Dictionary<(float x1, float y1, float x2, float y2), List<(float, float)>> table_bbox, List<(float, float, float, float)> vertical_segments, List<(float, float, float, float)> horizontal_segments)
Process(Page page, IDrawingProcessor drawingProcessor, bool process_background,
Process(Page page, IPageImageRenderer imageRenderer, bool process_background,
int blocksize = 15, int c = -2, int line_scale = 15, int iterations = 0,
List<(float x1, float y1, float x2, float y2)> table_areas = null,
List<(float x1, float y1, float x2, float y2)> table_regions = null)
Expand All @@ -54,10 +55,7 @@ public class OpenCvImageProcesser : IImageProcesser

List<(int, int, int, int)> horizontal_segments;

#pragma warning disable IDE0063 // Use simple 'using' statement
using (var ms = drawingProcessor.DrawPage(page, 3))
#pragma warning restore IDE0063 // Use simple 'using' statement
using (var image = Mat.FromImageData(ms.ToArray()))
using (var image = Mat.FromImageData(imageRenderer.Render(page, 3, PdfRendererImageFormat.Png).ToArray()))
{
(Mat img, Mat threshold) = AdaptiveThreshold(
image,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ public void DrawScale1()
{
var page = document.GetPage(1); // always page 1 for the moment

var stream = draw.DrawPage(page, 1);
#pragma warning disable IDE0063 // Use simple 'using' statement
using (var img = Bitmap.FromStream(stream))
using (var stream = new MemoryStream(draw.Render(page, 1)))
using (var img = Image.FromStream(stream))
#pragma warning restore IDE0063 // Use simple 'using' statement
{
img.Save(@"Files\Output\foo_basic_render_1.png");
Expand All @@ -46,8 +46,8 @@ public void DrawScale3()
{
var page = document.GetPage(1); // always page 1 for the moment

var stream = draw.DrawPage(page, 3);
#pragma warning disable IDE0063 // Use simple 'using' statement
using (var stream = new MemoryStream(draw.Render(page, 3)))
using (var img = Bitmap.FromStream(stream))
#pragma warning restore IDE0063 // Use simple 'using' statement
{
Expand Down
2 changes: 1 addition & 1 deletion Camelot.Tests/Camelot.Tests.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

<ItemGroup>
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="16.5.0" />
<PackageReference Include="PdfPig" Version="0.1.3" />
<PackageReference Include="PdfPig" Version="0.1.4" />
<PackageReference Include="xunit" Version="2.4.0" />
<PackageReference Include="xunit.runner.visualstudio" Version="2.4.0" />
<PackageReference Include="coverlet.collector" Version="1.2.0" />
Expand Down
7 changes: 6 additions & 1 deletion Camelot.Tests/StreamTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,12 @@ public void ExtractTables()
Assert.Equal((612, 792), stream.Dimensions);
Assert.Equal(612, stream.PdfWidth);
Assert.Equal(792, stream.PdfHeight);
Assert.Equal(84, stream.HorizontalText.Count);
//Assert.Equal(84, stream.HorizontalText.Count);

var parsingReport = tables[0].ParsingReport();
// parsing_report = {"accuracy": 99.02, "whitespace": 12.24, "order": 1, "page": 1}
parsingReport["order"] = 1;
parsingReport["page"] = 1;
}
}

Expand Down
2 changes: 1 addition & 1 deletion Camelot/Camelot.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="PdfPig" Version="0.1.3" />
<PackageReference Include="PdfPig" Version="0.1.4" />
</ItemGroup>

</Project>
3 changes: 2 additions & 1 deletion Camelot/ImageProcessing/DefaultImageProcesser.cs
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
using System;
using System.Collections.Generic;
using UglyToad.PdfPig.Content;
using UglyToad.PdfPig.Rendering;

namespace Camelot.ImageProcessing
{
public class DefaultImageProcesser : IImageProcesser
{
public (Dictionary<(float x1, float y1, float x2, float y2), List<(float, float)>> table_bbox, List<(float, float, float, float)> vertical_segments, List<(float, float, float, float)> horizontal_segments)
Process(Page page, IDrawingProcessor drawingProcessor, bool process_background, int threshold_blocksize, int threshold_constant, int line_scale, int iterations,
Process(Page page, IPageImageRenderer pageImageRenderer, bool process_background, int threshold_blocksize, int threshold_constant, int line_scale, int iterations,
List<(float x1, float y1, float x2, float y2)> table_areas, List<(float x1, float y1, float x2, float y2)> table_regions)
{
if (table_areas == null || table_areas.Count == 0)
Expand Down
17 changes: 0 additions & 17 deletions Camelot/ImageProcessing/IDrawingProcessor.cs

This file was deleted.

3 changes: 2 additions & 1 deletion Camelot/ImageProcessing/IImageProcesser.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using System.Collections.Generic;
using UglyToad.PdfPig.Content;
using UglyToad.PdfPig.Rendering;

namespace Camelot.ImageProcessing
{
Expand All @@ -21,7 +22,7 @@ public interface IImageProcesser
/// <para>vertical_segments - vertical lines (in PDF coordinates)</para>
/// horizontal_segments - horizontal lines (in PDF coordinates)</returns>
(Dictionary<(float x1, float y1, float x2, float y2), List<(float, float)>> table_bbox, List<(float, float, float, float)> vertical_segments, List<(float, float, float, float)> horizontal_segments)
Process(Page page, IDrawingProcessor drawingProcessor,
Process(Page page, IPageImageRenderer pageImageRenderer,
bool process_background, int threshold_blocksize, int threshold_constant, int line_scale, int iterations,
List<(float x1, float y1, float x2, float y2)> table_areas, List<(float x1, float y1, float x2, float y2)> table_regions);
}
Expand Down
11 changes: 6 additions & 5 deletions Camelot/Parsers/Lattice.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
using UglyToad.PdfPig.Content;
using UglyToad.PdfPig.DocumentLayoutAnalysis;
using UglyToad.PdfPig.Logging;
using UglyToad.PdfPig.Rendering;
using static Camelot.Core;

namespace Camelot.Parsers
Expand Down Expand Up @@ -102,7 +103,7 @@ public class Lattice : BaseParser
/// <summary>
/// Page image renderer passed to the image processer.
/// </summary>
public IDrawingProcessor DrawingProcessor { get; }
public IPageImageRenderer PageImageRenderer { get; }

private Dictionary<string, List<TextLine>> tBbox;
private List<(float, float, float, float)> verticalSegments;
Expand All @@ -121,7 +122,7 @@ public Lattice()
/// Lattice method of parsing looks for lines between text to parse the table.
/// </summary>
/// <param name="imageProcesser"></param>
/// <param name="drawingProcessor"></param>
/// <param name="pageImageRenderer"></param>
/// <param name="table_regions">List of page regions that may contain tables of the form x1,y1,x2,y2 where (x1, y1) -> left-top and (x2, y2) -> right-bottom in PDF coordinate space.</param>
/// <param name="table_areas">List of table area strings of the form x1,y1,x2,y2 where (x1, y1) -> left-top and (x2, y2) -> right-bottom in PDF coordinate space.</param>
/// <param name="process_background">Process background lines.</param>
Expand All @@ -143,7 +144,7 @@ public Lattice()
/// <param name="log"></param>
/// <param name="kwargs"></param>
public Lattice(IImageProcesser imageProcesser,
IDrawingProcessor drawingProcessor,
IPageImageRenderer pageImageRenderer,
List<(float x1, float y1, float x2, float y2)> table_regions = null,
List<(float x1, float y1, float x2, float y2)> table_areas = null,
bool process_background = false,
Expand All @@ -162,7 +163,7 @@ public Lattice(IImageProcesser imageProcesser,
ILog log = null) : base(log)
{
ImageProcesser = imageProcesser;
DrawingProcessor = drawingProcessor;
PageImageRenderer = pageImageRenderer;

TableRegions = table_regions;
TableAreas = table_areas;
Expand Down Expand Up @@ -309,7 +310,7 @@ private void GenerateTableBbox()
{
(tableBbox, verticalSegments, horizontalSegments) = ImageProcesser.Process(
Layout,
DrawingProcessor,
PageImageRenderer,
ProcessBackground,
ThresholdBlocksize,
ThresholdConstant,
Expand Down

0 comments on commit e7c0dfe

Please sign in to comment.