大学IT网 - 最懂大学生的IT学习网站! QQ资料交流群:367606806
当前位置:大学IT网 > C#技巧 > C#解析PDF

C#解析PDF

关键词:C#PDF  阅读(4972) 赞(16)

[摘要]本文是对C#解析PDF的讲解,对学习C#编程技术有所帮助,与大家分享。

C#解析PDF的方式有很多,比较好用的有iTextSharp和PdfBox。

PDF内容页如果是图片类型,例如扫描件,则需要进行OCR(光学字符识别)。

对于文本内容的PDF文档,在解析过程中,我目前只发现能以字符串的形式读取其内容,而不能读取其中的表格。据说PDF文档结构中没有表格的概念,因此自然读不到表格;如果确实如此,那么PDF中表格内容的解析,只能对获取到的字符串按照一定的逻辑自行处理了。

iTextSharp是一个C#开源项目;PdfBox为Java开源项目,借助于IKVM可以在.NET平台下使用。

Pdf转换Image,使用的是GhostScript,可以以API的方式调用,也可以以Windows命令行的方式调用。

OCR使用的是Asprise,识别效果较好(商业软件);另外还可以使用MS Office的Document Imaging(MODI,Office 2007)或OneNote(Office 2010)(需要依赖Office组件),以及Tesseract(HP->Google,效果很差)。

附上iTextSharp、PdfBox对PDF的解析代码。

iTextSharp辅助类

 using System;
 using System.Collections.Generic;
 using System.Text;
 
 using iTextSharp.text.pdf;
 using iTextSharp.text.pdf.parser;
 using System.IO;
 
 namespace eyuan
 {
     /// <summary>
     /// Helpers for reading PDF files through iTextSharp.
     /// </summary>
     public static class ITextSharpHandler
     {
         /// <summary>
         /// Extracts the text of every page of a PDF file.
         /// </summary>
         /// <param name="fileName">Path of the PDF file.</param>
         /// <returns>
         /// The extracted text, with a line break appended after each page. An empty string
         /// is returned when the file does not exist or cannot be opened. When extraction
         /// fails part-way, the error is logged and the text collected so far is returned.
         /// </returns>
         public static string ReadPdf(string fileName)
         {
             PdfReader reader = OpenReader(fileName);
             if (reader == null)
             {
                 return string.Empty;
             }

             StringBuilder sbFileContent = new StringBuilder();
             try
             {
                 // Page numbers passed to the extractor start at 1.
                 for (int i = 1; i <= reader.NumberOfPages; i++)
                 {
                     sbFileContent.AppendLine(PdfTextExtractor.GetTextFromPage(reader, i));
                 }
             }
             catch (Exception ex)
             {
                 LogHandler.LogWrite(string.Format(@"解析PDF文件{0}失败,错误:{1}", fileName, ex.ToString()));
             }
             finally
             {
                 reader.Close();
             }

             return sbFileContent.ToString();
         }

         /// <summary>
         /// Gets the number of pages in a PDF file.
         /// </summary>
         /// <param name="fileName">Path of the PDF file.</param>
         /// <returns>
         /// The page count, or -1 when the file does not exist or cannot be opened.
         /// </returns>
         public static int GetPdfPageCount(string fileName)
         {
             PdfReader reader = OpenReader(fileName);
             if (reader == null)
             {
                 return -1;
             }

             try
             {
                 return reader.NumberOfPages;
             }
             finally
             {
                 // The reader was previously left open here; close it once the count is read.
                 reader.Close();
             }
         }

         /// <summary>
         /// Opens a reader on the given file, logging the reason when that is not possible.
         /// </summary>
         /// <param name="fileName">Path of the PDF file.</param>
         /// <returns>An open reader the caller must close, or null on failure.</returns>
         private static PdfReader OpenReader(string fileName)
         {
             if (!File.Exists(fileName))
             {
                 LogHandler.LogWrite(@"指定的PDF文件不存在:" + fileName);
                 return null;
             }

             try
             {
                 return new PdfReader(fileName);
             }
             catch (Exception ex)
             {
                 // No reader exists when the constructor throws, so there is nothing to close.
                 LogHandler.LogWrite(string.Format(@"加载PDF文件{0}失败,错误:{1}", fileName, ex.ToString()));
                 return null;
             }
         }
     }
 }

PDFBox辅助类

 using org.pdfbox.pdmodel;
 using org.pdfbox.util;
 using System;
 using System.Collections.Generic;
 using System.IO;
 using System.Text;
 
 namespace eyuan
 {
     /// <summary>
     /// Helper for extracting text from PDF files through the PDFBox component.
     /// </summary>
     public static class PdfBoxHandler
     {
         /// <summary>
         /// Loads a PDF file with PDFBox and returns its text.
         /// </summary>
         /// <param name="input">Path of the PDF file.</param>
         /// <returns>
         /// The text produced by the stripper, or null when the file does not exist,
         /// cannot be loaded, or cannot be parsed (each failure is logged).
         /// </returns>
         public static string ReadPdf(string input)
         {
             if (!File.Exists(input))
             {
                 LogHandler.LogWrite(@"指定的PDF文件不存在:" + input);
                 return null;
             }

             // Step 1: load the document; a load failure ends the call.
             PDDocument document;
             try
             {
                 document = PDDocument.load(input);
             }
             catch (Exception loadError)
             {
                 LogHandler.LogWrite(string.Format(@"加载PDF文件{0}失败,错误:{1}", input, loadError.ToString()));
                 return null;
             }

             // Step 2: strip the text; the document is closed whether or not this succeeds.
             string text = null;
             try
             {
                 PDFTextStripper textStripper = new PDFTextStripper();
                 text = textStripper.getText(document);
             }
             catch (Exception parseError)
             {
                 LogHandler.LogWrite(string.Format(@"解析PDF文件{0}失败,错误:{1}", input, parseError.ToString()));
             }
             finally
             {
                 if (document != null)
                 {
                     document.close();
                 }
             }

             return text;
         }
     }
 }

另外附上PDF转Image,然后对Image进行OCR的代码。

转换PDF为Jpeg图片代码(GhostScript辅助类)

 using System;
 using System.Collections;
 using System.Collections.Generic;
 using System.Runtime.InteropServices;
 using System.Text;
 
 namespace eyuan
 {
     /// <summary>
     /// Wrapper around the Ghostscript C API exported by gsdll32.dll, used to render
     /// pages of a PDF file to image files.
     /// </summary>
     public class GhostscriptHandler
     {

         #region Ghostscript imports
         /// <summary>
         /// Creates a Ghostscript instance.
         /// This instance is passed to most other gsapi functions.
         /// The caller_handle will be provided to callback functions.
         /// At this stage, Ghostscript supports only one instance.
         /// </summary>
         /// <param name="pinstance">Receives the new instance handle.</param>
         /// <param name="caller_handle">Handle passed back to callback functions.</param>
         /// <returns>Ghostscript return code.</returns>
         [DllImport("gsdll32.dll", EntryPoint = "gsapi_new_instance")]
         private static extern int gsapi_new_instance(out IntPtr pinstance, IntPtr caller_handle);

         /// <summary>
         /// This is the important function that will perform the conversion.
         /// </summary>
         /// <param name="instance">Instance handle from gsapi_new_instance.</param>
         /// <param name="argc">Number of arguments.</param>
         /// <param name="argv">Pointer to an array of pointers to null-terminated strings.</param>
         /// <returns>Ghostscript return code.</returns>
         [DllImport("gsdll32.dll", EntryPoint = "gsapi_init_with_args")]
         private static extern int gsapi_init_with_args(IntPtr instance, int argc, IntPtr argv);

         /// <summary>
         /// Exit the interpreter.
         /// This must be called on shutdown if gsapi_init_with_args() has been called,
         /// and just before gsapi_delete_instance().
         /// </summary>
         /// <param name="instance">Instance handle from gsapi_new_instance.</param>
         /// <returns>Ghostscript return code.</returns>
         [DllImport("gsdll32.dll", EntryPoint = "gsapi_exit")]
         private static extern int gsapi_exit(IntPtr instance);

         /// <summary>
         /// Destroy an instance of Ghostscript.
         /// Before you call this, Ghostscript must have finished.
         /// If Ghostscript has been initialised, you must call gsapi_exit before gsapi_delete_instance.
         /// </summary>
         /// <param name="instance">Instance handle from gsapi_new_instance.</param>
         [DllImport("gsdll32.dll", EntryPoint = "gsapi_delete_instance")]
         private static extern void gsapi_delete_instance(IntPtr instance);
         #endregion

         #region Ghostscript return codes
         // Negative codes that the Ghostscript API documents as "not an error".
         // NOTE(review): values taken from the Ghostscript API return-code table; confirm
         // against the version of gsdll32.dll that is deployed.
         private const int GsErrorQuit = -101;
         private const int GsErrorInfo = -110;
         #endregion

         #region Fields
         private string _sDeviceFormat;
         private int _iWidth;
         private int _iHeight;
         private int _iResolutionX;
         private int _iResolutionY;
         private int _iJPEGQuality;
         private Boolean _bFitPage;
         private IntPtr _objHandle;
         #endregion

         #region Properties
         /// <summary>
         /// Ghostscript output device (passed as -sDEVICE). Overwritten by the
         /// deviceFormat argument on every call to Convert.
         /// </summary>
         public string OutputFormat
         {
             get { return _sDeviceFormat; }
             set { _sDeviceFormat = value; }
         }
         /// <summary>
         /// Output width; emitted as part of the -g switch when both Width and Height are positive.
         /// </summary>
         public int Width
         {
             get { return _iWidth; }
             set { _iWidth = value; }
         }
         /// <summary>
         /// Output height; emitted as part of the -g switch when both Width and Height are positive.
         /// </summary>
         public int Height
         {
             get { return _iHeight; }
             set { _iHeight = value; }
         }
         /// <summary>
         /// Horizontal resolution for the -r switch. Overwritten by the width argument
         /// on every call to Convert.
         /// </summary>
         public int ResolutionX
         {
             get { return _iResolutionX; }
             set { _iResolutionX = value; }
         }
         /// <summary>
         /// Vertical resolution for the -r switch. Overwritten by the height argument
         /// on every call to Convert.
         /// </summary>
         public int ResolutionY
         {
             get { return _iResolutionY; }
             set { _iResolutionY = value; }
         }
         /// <summary>
         /// When true, the -dPDFFitPage switch is added.
         /// </summary>
         public Boolean FitPage
         {
             get { return _bFitPage; }
             set { _bFitPage = value; }
         }
         /// <summary>
         /// Quality of JPEG compression; emitted as -dJPEGQ when in the range 1..100
         /// and the output device is a JPEG device.
         /// </summary>
         public int JPEGQuality
         {
             get { return _iJPEGQuality; }
             set { _iJPEGQuality = value; }
         }
         #endregion

         #region Construction
         /// <summary>
         /// Creates a handler whose caller handle is passed to gsapi_new_instance.
         /// </summary>
         /// <param name="objHandle">Caller handle forwarded to Ghostscript.</param>
         public GhostscriptHandler(IntPtr objHandle)
         {
             _objHandle = objHandle;
         }

         /// <summary>
         /// Creates a handler with a zero caller handle.
         /// </summary>
         public GhostscriptHandler()
         {
             _objHandle = IntPtr.Zero;
         }
         #endregion

         #region String handling
         /// <summary>
         /// Converts a string to a null-terminated byte array for Ghostscript.
         /// </summary>
         /// <param name="str">Argument text.</param>
         /// <returns>The encoded bytes followed by a single zero byte.</returns>
         private byte[] StringToAnsiZ(string str)
         {
             // Encode with the platform default encoding instead of truncating each UTF-16
             // char to one byte, which destroyed every character above U+00FF (for example
             // Chinese file names). ASCII input yields exactly the same bytes as before.
             byte[] encoded = Encoding.Default.GetBytes(str);
             byte[] aAnsi = new byte[encoded.Length + 1];
             Array.Copy(encoded, aAnsi, encoded.Length);
             // A new array is zero-filled, so the last element is already the terminator.
             return aAnsi;
         }
         #endregion

         #region Conversion
         /// <summary>
         /// Renders a page range of a PDF file to image file(s) through Ghostscript.
         /// Nothing is done (apart from logging) when the input file does not exist.
         /// </summary>
         /// <param name="inputFile">Path of the input PDF file.</param>
         /// <param name="outputFile">Output path, passed to Ghostscript as -sOutputFile.</param>
         /// <param name="firstPage">First page to render (-dFirstPage).</param>
         /// <param name="lastPage">Last page to render (-dLastPage).</param>
         /// <param name="deviceFormat">Ghostscript output device, for example "jpeg".</param>
         /// <param name="width">Stored as the X resolution and emitted through the -r switch.</param>
         /// <param name="height">Stored as the Y resolution and emitted through the -r switch.</param>
         public void Convert(string inputFile, string outputFile,
             int firstPage, int lastPage, string deviceFormat, int width, int height)
         {
             if (!System.IO.File.Exists(inputFile))
             {
                 LogHandler.LogWrite(string.Format("文件{0}不存在", inputFile));
                 return;
             }

             string[] sArgs = GetGeneratedArgs(inputFile, outputFile,
                 firstPage, lastPage, deviceFormat, width, height);
             int intElementCount = sArgs.Length;
             IntPtr[] aPtrArgs = new IntPtr[intElementCount];
             GCHandle[] aGCHandle = new GCHandle[intElementCount];
             GCHandle gchandleArgs = new GCHandle();

             try
             {
                 // Pin one null-terminated byte array per argument and record its address.
                 for (int i = 0; i < intElementCount; i++)
                 {
                     aGCHandle[i] = GCHandle.Alloc(StringToAnsiZ(sArgs[i]), GCHandleType.Pinned);
                     aPtrArgs[i] = aGCHandle[i].AddrOfPinnedObject();
                 }
                 // Pin the pointer array itself; its address is the native argv.
                 gchandleArgs = GCHandle.Alloc(aPtrArgs, GCHandleType.Pinned);

                 IntPtr intGSInstanceHandle;
                 int intReturn = gsapi_new_instance(out intGSInstanceHandle, _objHandle);
                 if (intReturn < 0)
                 {
                     // Without a valid instance, init/exit/delete must not be called.
                     LogHandler.LogWrite(string.Format("创建Ghostscript实例失败,返回码:{0}", intReturn));
                     return;
                 }

                 try
                 {
                     intReturn = gsapi_init_with_args(intGSInstanceHandle, intElementCount,
                         gchandleArgs.AddrOfPinnedObject());
                     if (intReturn < 0 && intReturn != GsErrorQuit && intReturn != GsErrorInfo)
                     {
                         LogHandler.LogWrite(string.Format("PDF文件{0}转换失败.\n错误:{1}",
                             inputFile, "Ghostscript返回码 " + intReturn));
                     }
                 }
                 catch (Exception ex)
                 {
                     LogHandler.LogWrite(string.Format("PDF文件{0}转换失败.\n错误:{1}", inputFile, ex.ToString()));
                 }
                 finally
                 {
                     gsapi_exit(intGSInstanceHandle);
                     gsapi_delete_instance(intGSInstanceHandle);
                 }
             }
             finally
             {
                 // Release every pinned handle. The loop used to be bounded by the
                 // Ghostscript return code, so nothing was freed after a successful run.
                 for (int i = 0; i < intElementCount; i++)
                 {
                     if (aGCHandle[i].IsAllocated)
                     {
                         aGCHandle[i].Free();
                     }
                 }
                 if (gchandleArgs.IsAllocated)
                 {
                     gchandleArgs.Free();
                 }
             }
         }
         #endregion

         #region Argument generation
         /// <summary>
         /// Builds the Ghostscript argument vector for one conversion and stores
         /// deviceFormat, width and height in the corresponding fields.
         /// </summary>
         /// <param name="inputFile">Path of the input PDF file (last argument).</param>
         /// <param name="outputFile">Output path (-sOutputFile).</param>
         /// <param name="firstPage">First page to render.</param>
         /// <param name="lastPage">Last page to render.</param>
         /// <param name="deviceFormat">Ghostscript output device.</param>
         /// <param name="width">Stored as the X resolution.</param>
         /// <param name="height">Stored as the Y resolution.</param>
         /// <returns>Fixed switches, optional switches, output file and input file, in that order.</returns>
         private string[] GetGeneratedArgs(string inputFile, string outputFile,
             int firstPage, int lastPage, string deviceFormat, int width, int height)
         {
             this._sDeviceFormat = deviceFormat;
             // NOTE(review): width/height are stored as resolution (-r), not as pixel size (-g).
             // Kept as-is because existing callers rely on it; confirm this is intended.
             this._iResolutionX = width;
             this._iResolutionY = height;

             List<string> args = new List<string>();
             args.Add("pdf2img");            // argv[0]: program name, of little real use
             args.Add("-dNOPAUSE");          // do not pause after each page
             args.Add("-dBATCH");            // stop after processing instead of going interactive
             args.Add("-dPARANOIDSAFER");
             args.Add("-sDEVICE=" + _sDeviceFormat);
             args.Add("-q");
             args.Add("-dQUIET");
             args.Add("-dNOPROMPT");
             args.Add("-dMaxBitmap=500000000");
             args.Add(String.Format("-dFirstPage={0}", firstPage));
             args.Add(String.Format("-dLastPage={0}", lastPage));
             args.Add("-dAlignToPixels=0");
             args.Add("-dGridFitTT=0");
             args.Add("-dTextAlphaBits=4");
             args.Add("-dGraphicsAlphaBits=4");

             // Optional switches.
             if (IsJpegDevice(_sDeviceFormat) && _iJPEGQuality > 0 && _iJPEGQuality < 101)
             {
                 args.Add("-dJPEGQ=" + _iJPEGQuality);
             }
             if (_iWidth > 0 && _iHeight > 0)
             {
                 args.Add("-g" + _iWidth + "x" + _iHeight);
             }
             if (_bFitPage)
             {
                 args.Add("-dPDFFitPage");
             }
             if (_iResolutionX > 0)
             {
                 if (_iResolutionY > 0)
                 {
                     args.Add("-r" + _iResolutionX + "x" + _iResolutionY);
                 }
                 else
                 {
                     args.Add("-r" + _iResolutionX);
                 }
             }

             // The output file and the input file always come last.
             args.Add(string.Format("-sOutputFile={0}", outputFile));
             args.Add(string.Format("{0}", inputFile));
             return args.ToArray();
         }

         /// <summary>
         /// Tells whether the device name selects JPEG output. The previous check only
         /// matched "jpg", so the quality switch was never emitted for the "jpeg" device.
         /// "jpg" is still accepted for backward compatibility.
         /// </summary>
         /// <param name="deviceFormat">Ghostscript output device; may be null.</param>
         /// <returns>True for "jpg" and for any name starting with "jpeg".</returns>
         private static bool IsJpegDevice(string deviceFormat)
         {
             return deviceFormat != null
                 && (deviceFormat == "jpg" || deviceFormat.StartsWith("jpeg", StringComparison.Ordinal));
         }
         #endregion


     }
 }

OCR,识别Image代码(Asprise辅助类)

 using System;
 using System.Collections.Generic;
 using System.Runtime.InteropServices;
 using System.Text;
 
 namespace PDFCaptureService
 {
     /// <summary>
     /// P/Invoke bindings for the native AspriseOCR.dll library, plus a helper that
     /// returns the text recognised in an image file.
     /// </summary>
     public static class AspriseOCRHandler
     {
         #region Native imports
         [DllImport("AspriseOCR.dll", EntryPoint = "OCR", CallingConvention = CallingConvention.Cdecl)]
         public static extern IntPtr OCR(string file, int type);

         [DllImport("AspriseOCR.dll", EntryPoint = "OCRpart", CallingConvention = CallingConvention.Cdecl)]
         private static extern IntPtr OCRpart(
             string file, int type, int startX, int startY, int width, int height);

         [DllImport("AspriseOCR.dll", EntryPoint = "OCRBarCodes", CallingConvention = CallingConvention.Cdecl)]
         private static extern IntPtr OCRBarCodes(string file, int type);

         [DllImport("AspriseOCR.dll", EntryPoint = "OCRpartBarCodes", CallingConvention = CallingConvention.Cdecl)]
         private static extern IntPtr OCRpartBarCodes(
             string file, int type, int startX, int startY, int width, int height);
         #endregion

         /// <summary>
         /// Runs the native OCR function on a file and returns the result as a managed string.
         /// </summary>
         /// <param name="fileName">Path of the image file handed to the OCR library.</param>
         /// <returns>
         /// The ANSI string the native call points to, as marshalled by
         /// Marshal.PtrToStringAnsi.
         /// </returns>
         public static string ReadImage(string fileName)
         {
             // NOTE(review): type -1 presumably lets the library detect the image type; confirm
             // against the Asprise documentation.
             return Marshal.PtrToStringAnsi(OCR(fileName, -1));
         }
     }
 }

调用示例

 // Render the pages of the PDF to a temporary JPEG file, then run OCR on that image.
 GhostscriptHandler ghostscriptHandler = new GhostscriptHandler();
 string tempJpgFileName = string.Format(GhostScriptImageName, Guid.NewGuid().ToString());
 int pdfPageCount = ITextSharpHandler.GetPdfPageCount(fileName);
 ghostscriptHandler.Convert(fileName, tempJpgFileName, 1, pdfPageCount, "jpeg", 100, 100);
 // OCR has to read the image Ghostscript produced; the original passed the PDF path here.
 // NOTE(review): if GhostScriptImageName contains a per-page pattern such as %d, the
 // actual output file names differ from tempJpgFileName - confirm against its definition.
 fileContent = AspriseOCRHandler.ReadImage(tempJpgFileName);


相关评论