// Office / PDF text extraction via Windows IFilter
// Last update: 2018-12-07
// Source: Internet
// Author: User
using System;
using System.Diagnostics;
using System.Runtime.InteropServices;
using System.Text;

// Managed declarations for the Windows IFilter COM interface plus a small
// helper class that uses them to pull plain text out of documents.
namespace IFilter
{
/// <summary>
/// Bit flags passed as <c>grfFlags</c> to <see cref="IFilter.Init"/>.
/// Values may be combined with the bitwise OR operator.
/// </summary>
[Flags]
public enum IFILTER_INIT : uint
{
    NONE = 0,
    CANON_PARAGRAPHS = 1,
    HARD_LINE_BREAKS = 2,
    CANON_HYPHENS = 4,
    CANON_SPACES = 8,
    APPLY_INDEX_ATTRIBUTES = 16,
    APPLY_CRAWL_ATTRIBUTES = 256,
    APPLY_OTHER_ATTRIBUTES = 32,
    INDEXING_ONLY = 64,
    SEARCH_LINKS = 128,
    FILTER_OWNED_VALUE_OK = 512
}

/// <summary>
/// Break type reported in <see cref="STAT_CHUNK.breakType"/>.
/// </summary>
public enum CHUNK_BREAKTYPE
{
    CHUNK_NO_BREAK = 0,
    CHUNK_EOW = 1,
    CHUNK_EOS = 2,
    CHUNK_EOP = 3,
    CHUNK_EOC = 4
}

/// <summary>
/// Bit flags reported in <see cref="STAT_CHUNK.flags"/> describing what a chunk contains.
/// </summary>
[Flags]
public enum CHUNKSTATE
{
    CHUNK_TEXT = 0x1,
    CHUNK_VALUE = 0x2,
    CHUNK_FILTER_OWNED_VALUE = 0x4
}

/// <summary>
/// Sequential-layout property specifier embedded in <see cref="FULLPROPSPEC"/>.
/// </summary>
[StructLayout(LayoutKind.Sequential)]
public struct PROPSPEC
{
    public uint ulKind;
    public uint propid;
    public IntPtr lpwstr;
}

/// <summary>
/// Property set GUID paired with a <see cref="PROPSPEC"/>; used for the
/// attribute array of <see cref="IFilter.Init"/> and in <see cref="STAT_CHUNK"/>.
/// </summary>
[StructLayout(LayoutKind.Sequential)]
public struct FULLPROPSPEC
{
    public Guid guidPropSet;
    public PROPSPEC psProperty;
}

/// <summary>
/// Chunk descriptor filled in by <see cref="IFilter.GetChunk"/>.
/// The two enum fields are marshalled as 32-bit unsigned values.
/// </summary>
[StructLayout(LayoutKind.Sequential)]
public struct STAT_CHUNK
{
    public uint idChunk;
    [MarshalAs(UnmanagedType.U4)] public CHUNK_BREAKTYPE breakType;
    [MarshalAs(UnmanagedType.U4)] public CHUNKSTATE flags;
    public uint locale;
    [MarshalAs(UnmanagedType.Struct)] public FULLPROPSPEC attribute;
    public uint idChunkSource;
    public uint cwcStartSource;
    public uint cwcLenSource;
}

/// <summary>
/// Region descriptor passed to <see cref="IFilter.BindRegion"/>.
/// </summary>
[StructLayout(LayoutKind.Sequential)]
public struct FILTERREGION
{
    public uint idChunk;
    public uint cwcStart;
    public uint cwcExtent;
}

/// <summary>
/// COM import of the IFilter interface (IUnknown-based).
/// The declaration order of the methods defines the vtable layout and must not be changed.
/// </summary>
[ComImport]
[Guid("89BCB740-6119-101A-BCB7-00DD010655AF")]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
public interface IFilter
{
    /// <summary>Initializes the filter; returns the raw HRESULT.</summary>
    /// <param name="grfFlags">Combination of <see cref="IFILTER_INIT"/> flags.</param>
    /// <param name="cAttributes">Number of entries in <paramref name="aAttributes"/> (used as the marshalled array size).</param>
    /// <param name="aAttributes">Attribute specifiers; the caller in this file passes null with a count of 0.</param>
    /// <param name="pdwFlags">Flags value passed by reference to the filter.</param>
    [PreserveSig]
    int Init(
        [MarshalAs(UnmanagedType.U4)] IFILTER_INIT grfFlags,
        uint cAttributes,
        [MarshalAs(UnmanagedType.LPArray, SizeParamIndex = 1)] FULLPROPSPEC[] aAttributes,
        ref uint pdwFlags);

    /// <summary>Retrieves the descriptor of the next chunk; returns the raw HRESULT.</summary>
    [PreserveSig]
    int GetChunk(out STAT_CHUNK pStat);

    /// <summary>
    /// Reads text of the current chunk into <paramref name="buffer"/>; returns the raw HRESULT.
    /// </summary>
    /// <param name="pcwcBuffer">Passed by reference: buffer capacity in characters on input.</param>
    /// <param name="buffer">Destination buffer, marshalled as a wide-character string.</param>
    [PreserveSig]
    int GetText(
        ref uint pcwcBuffer,
        [MarshalAs(UnmanagedType.LPWStr)] StringBuilder buffer);

    /// <summary>Retrieves a property value pointer; failures surface as exceptions (no PreserveSig).</summary>
    void GetValue(ref UIntPtr ppPropValue);

    /// <summary>Binds a region of text; failures surface as exceptions (no PreserveSig).</summary>
    void BindRegion(
        [MarshalAs(UnmanagedType.Struct)] FILTERREGION origPos,
        ref Guid riid,
        ref UIntPtr ppunk);
}

/// <summary>
/// COM coclass import identified by the CLSID in its Guid attribute; declares no members.
/// </summary>
[ComImport]
[Guid("f07f3920-7b8c-11cf-9be8-00aa004b9986")]
public class CFilter
{
}

/// <summary>
/// Numeric constants used with IFilter: storage property ids (PID_STG_*) and
/// filter status codes expressed as signed ints so they can be compared directly
/// with the int results of the [PreserveSig] methods on <see cref="IFilter"/>.
/// </summary>
public class IFilterConstants
{
    public const uint PID_STG_DIRECTORY = 0x00000002;
    public const uint PID_STG_CLASSID = 0x00000003;
    public const uint PID_STG_STORAGETYPE = 0x00000004;
    public const uint PID_STG_VOLUME_ID = 0x00000005;
    public const uint PID_STG_PARENT_WORKID = 0x00000006;
    public const uint PID_STG_SECONDARYSTORE = 0x00000007;
    public const uint PID_STG_FILEINDEX = 0x00000008;
    public const uint PID_STG_LASTCHANGEUSN = 0x00000009;
    public const uint PID_STG_NAME = 0x0000000a;
    public const uint PID_STG_PATH = 0x0000000b;
    public const uint PID_STG_SIZE = 0x0000000c;
    public const uint PID_STG_ATTRIBUTES = 0x0000000d;
    public const uint PID_STG_WRITETIME = 0x0000000e;
    public const uint PID_STG_CREATETIME = 0x0000000f;
    public const uint PID_STG_ACCESSTIME = 0x00000010;
    public const uint PID_STG_CHANGETIME = 0x00000011;
    public const uint PID_STG_CONTENTS = 0x00000013;
    public const uint PID_STG_SHORTNAME = 0x00000014;

    // The 0x8xxxxxxx literals exceed int.MaxValue, so the conversion must be unchecked.
    public const int FILTER_E_END_OF_CHUNKS = unchecked((int)0x80041700);
    public const int FILTER_E_NO_MORE_TEXT = unchecked((int)0x80041701);
    public const int FILTER_E_NO_MORE_VALUES = unchecked((int)0x80041702);
    public const int FILTER_E_NO_TEXT = unchecked((int)0x80041705);
    public const int FILTER_E_NO_VALUES = unchecked((int)0x80041706);
    public const int FILTER_S_LAST_TEXT = unchecked((int)0x00041709);
}
/// <summary>
/// IFilter return codes (HRESULT values as unsigned 32-bit numbers).
/// </summary>
public enum IFilterReturnCodes : uint
{
    /// <summary>
    /// Success
    /// </summary>
    S_OK = 0,
    /// <summary>
    /// The function was denied access to the filter file.
    /// </summary>
    E_ACCESSDENIED = 0x80070005,
    /// <summary>
    /// The function encountered an invalid handle, probably due to a low-memory situation.
    /// </summary>
    E_HANDLE = 0x80070006,
    /// <summary>
    /// The function received an invalid parameter.
    /// </summary>
    E_INVALIDARG = 0x80070057,
    /// <summary>
    /// Out of memory
    /// </summary>
    E_OUTOFMEMORY = 0x8007000E,
    /// <summary>
    /// Not implemented
    /// </summary>
    E_NOTIMPL = 0x80004001,
    /// <summary>
    /// Unknown error (Win32 E_FAIL; the previous literal 0x80000008 is the legacy 16-bit value).
    /// </summary>
    E_FAIL = 0x80004005,
    /// <summary>
    /// File not filtered due to password protection
    /// </summary>
    FILTER_E_PASSWORD = 0x8004170B,
    /// <summary>
    /// The document format is not recognized by the filter
    /// </summary>
    FILTER_E_UNKNOWNFORMAT = 0x8004170C,
    /// <summary>
    /// No text in current chunk
    /// </summary>
    FILTER_E_NO_TEXT = 0x80041705,
    /// <summary>
    /// No more chunks of text available in object
    /// </summary>
    FILTER_E_END_OF_CHUNKS = 0x80041700,
    /// <summary>
    /// No more text available in chunk
    /// </summary>
    FILTER_E_NO_MORE_TEXT = 0x80041701,
    /// <summary>
    /// No more property values available in chunk
    /// </summary>
    FILTER_E_NO_MORE_VALUES = 0x80041702,
    /// <summary>
    /// Unable to access object
    /// </summary>
    FILTER_E_ACCESS = 0x80041703,
    /// <summary>
    /// Moniker doesn't cover entire region
    /// </summary>
    FILTER_W_MONIKER_CLIPPED = 0x00041704,
    /// <summary>
    /// Unable to bind IFilter for embedded object
    /// </summary>
    FILTER_E_EMBEDDING_UNAVAILABLE = 0x80041707,
    /// <summary>
    /// Unable to bind IFilter for linked object
    /// </summary>
    FILTER_E_LINK_UNAVAILABLE = 0x80041708,
    /// <summary>
    /// This is the last text in the current chunk
    /// </summary>
    FILTER_S_LAST_TEXT = 0x00041709,
    /// <summary>
    /// This is the last value in the current chunk
    /// </summary>
    FILTER_S_LAST_VALUES = 0x0004170A
}
/// <summary>
/// Convenience class which provides static methods to extract text from files using installed IFilters.
/// </summary>
public class DefaultParser
{
    public DefaultParser()
    {
    }

    /// <summary>
    /// Native entry point in query.dll that resolves and loads the IFilter for a file path.
    /// Returns the raw HRESULT; on success <paramref name="ppIUnk"/> receives the filter.
    /// </summary>
    [DllImport("query.dll", CharSet = CharSet.Unicode)]
    private extern static int LoadIFilter(
        string pwcsPath,
        [MarshalAs(UnmanagedType.IUnknown)] object pUnkOuter,
        ref IFilter ppIUnk);

    /// <summary>
    /// Tries to load the IFilter registered for <paramref name="filename"/>.
    /// </summary>
    /// <returns>The loaded filter, or null when the native call does not return S_OK.</returns>
    private static IFilter loadIFilter(string filename)
    {
        object outer = null;
        IFilter filter = null;

        // Try to load the corresponding IFilter
        int resultLoad = LoadIFilter(filename, outer, ref filter);
        if (resultLoad != (int)IFilterReturnCodes.S_OK)
        {
            return null;
        }

        return filter;
    }

    /// <summary>
    /// Reports whether an IFilter can be loaded for <paramref name="filename"/>.
    /// </summary>
    /// <param name="filename">Path of the file to probe.</param>
    /// <returns>true when a filter was loaded; otherwise false.</returns>
    public static bool IsParseable(string filename)
    {
        IFilter filter = loadIFilter(filename);
        if (filter == null)
        {
            return false;
        }

        // The probe only needs to know that loading worked; release the COM
        // object right away instead of leaving it to the finalizer.
        Marshal.ReleaseComObject(filter);
        return true;
    }

    /// <summary>
    /// Extracts the text content of the file at <paramref name="path"/> using its IFilter.
    /// </summary>
    /// <param name="path">Path of the file to read.</param>
    /// <returns>
    /// The concatenated text of all text chunks, or <see cref="String.Empty"/> when no
    /// filter could be loaded for the file.
    /// </returns>
    /// <exception cref="InvalidOperationException">The filter was loaded but its Init call did not return S_OK.</exception>
    public static string Extract(string path)
    {
        StringBuilder sb = new StringBuilder();
        IFilter filter = null;

        try
        {
            filter = loadIFilter(path);
            if (filter == null)
            {
                return String.Empty;
            }

            uint initFlags = 0;
            STAT_CHUNK ps = new STAT_CHUNK();

            IFILTER_INIT iflags =
                IFILTER_INIT.CANON_HYPHENS |
                IFILTER_INIT.CANON_PARAGRAPHS |
                IFILTER_INIT.CANON_SPACES |
                IFILTER_INIT.APPLY_CRAWL_ATTRIBUTES |
                IFILTER_INIT.APPLY_INDEX_ATTRIBUTES |
                IFILTER_INIT.APPLY_OTHER_ATTRIBUTES |
                IFILTER_INIT.HARD_LINE_BREAKS |
                IFILTER_INIT.SEARCH_LINKS |
                IFILTER_INIT.FILTER_OWNED_VALUE_OK;

            if (filter.Init(iflags, 0, null, ref initFlags) != (int)IFilterReturnCodes.S_OK)
            {
                throw new InvalidOperationException("Problem initializing an IFilter for: \n" + path + "\n");
            }

            // Walk every chunk until GetChunk stops returning S_OK.
            while (filter.GetChunk(out ps) == (int)IFilterReturnCodes.S_OK)
            {
                // flags is a [Flags] value, so test the text bit rather than whole-value equality.
                if ((ps.flags & CHUNKSTATE.CHUNK_TEXT) == 0)
                {
                    continue;
                }

                IFilterReturnCodes scode = IFilterReturnCodes.S_OK;
                while (scode == IFilterReturnCodes.S_OK || scode == IFilterReturnCodes.FILTER_S_LAST_TEXT)
                {
                    uint pcwcBuffer = 65536;
                    StringBuilder sbBuffer = new StringBuilder((int)pcwcBuffer);

                    // HRESULTs are signed; reinterpret the bits to match the uint-based enum.
                    int hr = filter.GetText(ref pcwcBuffer, sbBuffer);
                    scode = (IFilterReturnCodes)unchecked((uint)hr);

                    if (pcwcBuffer > 0 && sbBuffer.Length > 0)
                    {
                        // Should never happen, but it happens: the reported count can exceed
                        // what actually landed in the buffer, so clamp before slicing.
                        if (sbBuffer.Length < pcwcBuffer)
                        {
                            pcwcBuffer = (uint)sbBuffer.Length;
                        }

                        sb.Append(sbBuffer.ToString(0, (int)pcwcBuffer));

                        // Separate successive text runs so words from adjacent reads do not
                        // fuse together (the previous empty literal made this call a no-op).
                        // NOTE(review): " " is assumed to be the intended separator; "\r\n" is the alternative.
                        sb.Append(" ");
                    }
                }
            }
        }
        finally
        {
            if (filter != null)
            {
                Marshal.ReleaseComObject(filter);
            }
        }

        return sb.ToString();
    }
}