How to use the IDocumentContentExtractor Interface
This section shows how to use the IDocumentContentExtractor interface. This interface is used to extract content from office documents, emails, PDFs, HTML files, raster images, multimedia files, and vector images. This section also shows, by example, how to decrypt file formats that support decryption.
The IDocumentContentExtractor interface has one method used to extract document content: DocumentContent ExtractContent(string password = null)
The password is optional and is used to decrypt formats that support decryption, such as Microsoft Word, Excel, PowerPoint, PDF, and OpenDocument formats.
The following example shows how to use the IDocumentContentExtractor interface to extract content from a password-encrypted document by prompting the user to enter a password via a dialog box. Alternatively, if the user already knows the passwords of the documents to be processed, the user can set up a password list to cycle through instead of being prompted via a dialog box.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
// Example: identify a file's format, obtain the matching content extractor, and extract its
// content through IDocumentContentExtractor, prompting the user for a password when the
// document turns out to be encrypted.
var settings = new ContentExtractionSettings();
settings.CalculateBinaryHash = true; // Calculate binary hashes
settings.CalculateContentHash = true; // Calculate content hashes on supported formats such as emails and Office documents
settings.ExtractEmbeddedDocuments = true; // Extract attachments\embedded documents
settings.ExtractOfficeEmbeddedMedia = true; // Extract embedded media for Office formats
settings.ExtractionType = ContentExtractionType.TextAndMetadata; // We want text and metadata extracted
using (var stream = File.OpenRead(filePath))
{
    //========================================================================================
    // Step 1 - Identify document file format:
    //========================================================================================
    var docIdResult = DocumentIdentifier.Identify(stream, filePath);
    //========================================================================================
    // Step 2 - Get the content extractor result for the file format:
    //========================================================================================
    var contentExtractorResult = ContentExtractorFactory.GetContentExtractor(stream, docIdResult, filePath, settings);
    if (contentExtractorResult.HasError)
    {
        LogErrorMessage(string.Format("Error getting content extractor for file ID {0}: {1}", docIdResult.ID, contentExtractorResult.Error));
    }
    else
    {
        //========================================================================================
        // Step 3 - Get the specific IContentExtractor derived interface for the file format:
        //========================================================================================
        var extractorType = contentExtractorResult.ContentExtractor.ContentExtractorType;
        switch (extractorType)
        {
            case ContentExtractorType.Document:
                {
                    // The "ContentExtractorType.Document" extractor type indicates the IDocumentContentExtractor interface is to be used
                    var docExtractor = ((IDocumentContentExtractor)contentExtractorResult.ContentExtractor);
                    var docContent = docExtractor.ExtractContent();
                    // Password cycling for supported encrypted documents:
                    // - DocumentContent.Result = ContentResult.WrongPassword, indicating wrong password, because on the 1st call to the
                    //   ExtractContent method above we passed in null (default value) and this document is encrypted
                    // - User can also set up a password list of known passwords to cycle through when encountering encrypted documents.
                    // - Check if "docExtractor.SupportsDecryption" is true, if true then this format supports decryption
                    if (docContent.Result == ContentResult.WrongPassword && docContent.IsEncrypted && docExtractor.SupportsDecryption)
                    {
                        // Keep prompting until a password is accepted or the user dismisses the dialog.
                        while (docContent.Result == ContentResult.WrongPassword)
                        {
                            // A dialog shown with ShowDialog() is not disposed when it closes, so dispose each
                            // instance explicitly instead of leaking one per retry.
                            using (var passwordDialog = new PasswordForm())
                            {
                                if (passwordDialog.ShowDialog() != DialogResult.OK)
                                {
                                    // User cancelled: stop retrying and fall through with the WrongPassword result.
                                    break;
                                }
                                // Try the user entered password:
                                docContent = docExtractor.ExtractContent(passwordDialog.Password);
                            }
                        }
                    }
                    if (docContent.Result == ContentResult.Ok)
                    {
                        // Success - Do something with extracted content, for example use Lucene.NET to index the text and index the metadata as fields
                        //
                        // Check for special document content classes that derive from DocumentContent class and
                        // have extra content extracted:
                        //
                        if (docContent is EmailDocumentContent)
                        {
                            var emailDocContent = (EmailDocumentContent)docContent;
                            // Do something with the extra email related content...
                        }
                        else if (docContent is HtmlDocumentContent)
                        {
                            var htmlDocContent = (HtmlDocumentContent)docContent;
                            // Do something with the extra HTML related content... the extra HTML related content such as the
                            // HyperLinks property can be used for web-crawling a website
                        }
                        else if (docContent is PdfDocumentContent)
                        {
                            var pdfDocContent = (PdfDocumentContent)docContent;
                            // Do something with the extra PDF related content...
                        }
                    }
                    else
                    {
                        // Some level of Error - Documents that have an error often times still have some content extracted such as metadata,
                        // attachments, or text (there are fallback text extraction procedures for some formats and/or binary-text filtering
                        // can be used to extract useful text in a lot of cases)
                    }
                }
                break;
            case ContentExtractorType.Archive:
                {
                    var archiveExtractor = ((IArchiveExtractor)contentExtractorResult.ContentExtractor);
                    // TODO: A different help topic will show how to use this interface
                }
                break;
            case ContentExtractorType.MailStore:
                {
                    var mailStoreExtractor = ((IMailStoreExtractor)contentExtractorResult.ContentExtractor);
                    // TODO: A different help topic will show how to use this interface
                }
                break;
            case ContentExtractorType.Database:
                {
                    var databaseExtractor = ((IDatabaseExtractor)contentExtractorResult.ContentExtractor);
                    // TODO: A different help topic will show how to use this interface
                }
                break;
            case ContentExtractorType.DocumentStore:
                {
                    // NOTE(review): every other case casts to an interface named after its extractor type, but this one
                    // casts to IDocumentContentExtractor - possibly a copy/paste slip. Confirm the intended interface
                    // for document stores against the SDK reference before relying on this cast.
                    var docStoreExtractor = ((IDocumentContentExtractor)contentExtractorResult.ContentExtractor);
                    // TODO: A different help topic will show how to use this interface
                }
                break;
            case ContentExtractorType.Unsupported:
                {
                    var bin2TextExtractor = ((IUnsupportedExtractor)contentExtractorResult.ContentExtractor);
                    // TODO: A different help topic will show how to use this interface
                }
                break;
            case ContentExtractorType.LargeUnsupported:
                {
                    var largeDocBin2TextExtractor = ((ILargeUnsupportedExtractor)contentExtractorResult.ContentExtractor);
                    // TODO: A different help topic will show how to use this interface
                }
                break;
            case ContentExtractorType.LargeEncodedText:
                {
                    var largeEncodedTextExtractor = ((ILargeEncodedTextExtractor)contentExtractorResult.ContentExtractor);
                    // TODO: A different help topic will show how to use this interface
                }
                break;
        }
    }
}
The following unit test example code illustrates the various document content extracted like text, metadata, attachments, hashes, and languages identified in the extracted text:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92
/// <summary>
/// Extracts text and metadata from a known Outlook .msg test file and verifies the extracted
/// text, child documents, binary/content/email hashes, sender and recipient details, body
/// flags, language identification results, and metadata fields against expected values.
/// </summary>
[TestMethod]
public void OutlookMsgTextAndMetadataTest()
{
    var settings = new ContentExtractionSettings
    {
        CalculateBinaryHash = true,
        CalculateContentHash = true,
        ShowUtcOffsetForTimeText = true,
        ExtractEmbeddedDocuments = false,
        ExtractOfficeEmbeddedMedia = false,
        CollectionTimeZone = TimeZoneInfo.Utc,
        ExtractionType = ContentExtractionType.TextAndMetadata
    };
    var file = Path.Combine(_testFilesBasePath, @"Deduplication\Email", "test.msg");
    var sentDateUtc = new DateTime(2007, 1, 5, 0, 16, 11, DateTimeKind.Utc);

    using (var stream = File.OpenRead(file))
    {
        var idResult = DocumentIdentifier.Identify(stream, file);
        var extractorResult = ContentExtractorFactory.GetContentExtractor(stream, idResult, file, settings);

        // Fail with the factory's own error instead of a NullReferenceException further down.
        Assert.IsFalse(extractorResult.HasError, "Could not get a content extractor: " + extractorResult.Error);

        var extractorType = extractorResult.ContentExtractor.ContentExtractorType;
        Assert.AreEqual(ContentExtractorType.Document, extractorType);

        var docExtractor = ((IDocumentContentExtractor)extractorResult.ContentExtractor);
        var docContent = docExtractor.ExtractContent();

        // Assert that the DocumentContent object is an email type and is safe to type-cast to EmailDocumentContent:
        Assert.IsTrue(docContent.IsEmailType);
        var emailContent = docContent as EmailDocumentContent;
        Assert.IsNotNull(emailContent, "DocumentContent reported an email type but is not an EmailDocumentContent.");

        Assert.AreEqual(ContentResult.Ok, emailContent.Result);
        Assert.IsTrue(emailContent.FormatId.ID == Id.OutlookMessage);

        // Extracted text is machine-generated, so compare ordinally rather than with the current culture.
        Assert.IsTrue(emailContent.ExtractedText.StartsWith("From: dummy <dummy@fake.org>", StringComparison.Ordinal));
        Assert.IsTrue(emailContent.ExtractedText.IndexOf("Subject: RE: Party", StringComparison.Ordinal) > 0);
        Assert.IsTrue(emailContent.ExtractedText.IndexOf("Are you going to the party?", StringComparison.Ordinal) > 0);

        Assert.AreEqual(1, emailContent.ChildDocuments.Count);
        Assert.AreEqual("directions.doc", emailContent.ChildDocuments[0].Name);

        Assert.AreEqual("471B0C2443F2B2E610B6BF905480DEF5", emailContent.MD5BinaryHash);
        Assert.AreEqual("86EF0955E35A6BDBF003BE813FBF0395B6D7546F", emailContent.SHA1BinaryHash);

        // Email types support content hash, the content hash aids in de-duplicating the same email whether they are saved
        // as .msg, .eml, or .emlx formats. If you save an .eml message as an .msg using Outlook, these 2 files will have
        // different binary hashes due to different bytes in files. However, the content hash most likely will be the same
        // for both files.
        Assert.AreEqual("2F780AEA9DD7930439B340FFBC660A53", emailContent.MD5ContentHash);
        Assert.AreEqual("EE6B61C9BAEBC87D6F5D2CB33B65EEA9B0F5C3E2", emailContent.SHA1ContentHash);

        // Email types have extra hashes calculated:
        Assert.AreEqual("806A056F8F7BAD3FAE31A226B26EE1E3653AF423", emailContent.Sha1HeaderHash);
        Assert.AreEqual("6F38C846E0E468732FBCA8CCB9EDDE131A1501A6", emailContent.Sha1BodyHash);
        Assert.AreEqual("D8FE882FC2E8AE88D0504D58BBF324C7677BCD8B", emailContent.Sha1RecipientsHash);

        Assert.AreEqual("RE: Party", emailContent.Subject);

        Assert.IsNotNull(emailContent.Sender);
        Assert.AreEqual("dummy", emailContent.Sender.Name);
        Assert.AreEqual(EmailAddressType.Sender, emailContent.Sender.AddressType);
        Assert.AreEqual("dummy@fake.org", emailContent.Sender.SmtpAddress);
        Assert.AreEqual(string.Empty, emailContent.Sender.X500DN);

        Assert.IsNotNull(emailContent.From);
        Assert.AreEqual(1, emailContent.From.Count);
        Assert.AreEqual(emailContent.Sender.Name, emailContent.From[0].Name);
        Assert.AreEqual(EmailAddressType.From, emailContent.From[0].AddressType);
        Assert.AreEqual(emailContent.Sender.SmtpAddress, emailContent.From[0].SmtpAddress);
        Assert.IsNull(emailContent.From[0].X500DN);

        // Make a missing sent date an assertion failure rather than an exception from ".Value".
        Assert.IsNotNull(emailContent.SentDate);
        Assert.IsTrue(emailContent.SentDate.Value.CompareTo(sentDateUtc) == 0);

        Assert.IsTrue(emailContent.HasHtmlBody);
        Assert.IsTrue(emailContent.HasTextBody);
        Assert.IsTrue(emailContent.HasRtfBody);
        Assert.AreEqual(EmailBodyType.RtfEncapsulatedHTML, emailContent.BodyType); // HTML body originated from RTF body (was encapsulated in RTF body)
        Assert.AreEqual("<1234567892423424242.fake.org>", emailContent.MessageId);

        // Extracted text language identification results:
        Assert.IsNotNull(emailContent.LanguageIdResults);
        Assert.AreEqual(1, emailContent.LanguageIdResults.Count);
        Assert.AreEqual("en", emailContent.LanguageIdResults[0].LangIso639); // English

        Assert.AreEqual(34, emailContent.Metadata.Count); // 34 email metadata fields extracted in total

        // We use "known" string and datetime metadata helper classes for the metadata field names (in this test it is known that these fields exist
        // in the test file):
        Assert.AreEqual("dummy <dummy@fake.org>", ((StringProperty)emailContent.Metadata[KnownStringMetadataFields.From]).Value);
        Assert.AreEqual("dummy", ((StringProperty)emailContent.Metadata[KnownStringMetadataFields.SenderName]).Value);
        Assert.AreEqual("dummy", ((StringProperty)emailContent.Metadata[KnownStringMetadataFields.SentRepresentingName]).Value);
        Assert.AreEqual("dummy@fake.org", ((StringProperty)emailContent.Metadata[KnownStringMetadataFields.SenderSmtpAddress]).Value);
        Assert.AreEqual("<friendOfdummy@fake.org>", ((StringProperty)emailContent.Metadata[KnownStringMetadataFields.To]).Value);
        Assert.IsTrue(((DateTimeProperty)emailContent.Metadata[KnownDateTimeMetadataFields.SentDate]).Value.CompareTo(sentDateUtc) == 0);
    }
}