How to use the IDocumentContentExtractor Interface |
This section shows how to use the IDocumentContentExtractor interface. This interface is used to extract content from office documents, emails, PDFs, HTML files, raster images, multimedia files, and vector images. The section also shows, by example, how to decrypt file formats that support decryption.
The IDocumentContentExtractor interface has one method used to extract document content: DocumentContent ExtractContent(string password = null)
The password is optional and is used to decrypt formats that support decryption, such as Microsoft Word, Excel, PowerPoint, PDF, and OpenDocument formats.
The following example shows how to use the IDocumentContentExtractor interface to extract content from a password-encrypted document by prompting the user to enter a password via a dialog box. Optionally, if the user knows the passwords of the documents they are going to process, they can set up a password list to cycle through instead of being prompted via a dialog box.
Attempting to decrypt a document with a password is an expensive operation by design. This SDK is not optimized to be a password cracker and should not be used as one. Any password list that is cycled through when encountering documents should contain only known passwords that decrypt the password-encrypted documents being processed. |
// Example: identify a file, obtain the matching content extractor, and extract its content.
// Encrypted documents are retried with passwords entered by the user via a dialog box.
var settings = new ContentExtractionSettings();
settings.CalculateBinaryHash = true;        // Calculate binary hashes
settings.CalculateContentHash = true;       // Calculate content hashes on supported formats such as emails and Office documents
settings.ExtractEmbeddedDocuments = true;   // Extract attachments\embedded documents
settings.ExtractOfficeEmbeddedMedia = true; // Extract embedded media for Office formats
settings.ExtractionType = ContentExtractionType.TextAndMetadata; // We want text and metadata extracted

using (var stream = File.OpenRead(filePath))
{
    //========================================================================================
    // Step 1 - Identify document file format:
    //========================================================================================
    var docIdResult = DocumentIdentifier.Identify(stream, filePath);

    //========================================================================================
    // Step 2 - Get the content extractor result for the file format:
    //========================================================================================
    var contentExtractorResult = ContentExtractorFactory.GetContentExtractor(stream, docIdResult, filePath, settings);

    if (contentExtractorResult.HasError)
    {
        LogErrorMessage(string.Format("Error getting content extractor for file ID {0}: {1}", docIdResult.ID, contentExtractorResult.Error));
    }
    else
    {
        //========================================================================================
        // Step 3 - Get the specific IContentExtractor derived interface for the file format:
        //========================================================================================
        var extractorType = contentExtractorResult.ContentExtractor.ContentExtractorType;

        switch (extractorType)
        {
            case ContentExtractorType.Document:
                {
                    // The "ContentExtractorType.Document" extractor type indicates the IDocumentContentExtractor interface
                    // is to be used
                    var docExtractor = (IDocumentContentExtractor)contentExtractorResult.ContentExtractor;

                    // First attempt passes no password (the parameter defaults to null).
                    var docContent = docExtractor.ExtractContent();

                    // Password cycling for supported encrypted documents:
                    //  - DocumentContent.Result = ContentResult.WrongPassword, indicating wrong password, because on 1st call to ExtractContent
                    //    method above we passed in null (default value) and this document is encrypted
                    //  - User can also set up a password list of known passwords to cycle through when encountering encrypted documents.
                    //  - Check if "docExtractor.SupportsDecryption" is true, if true then this format supports decryption
                    if (docContent.Result == ContentResult.WrongPassword && docContent.IsEncrypted && docExtractor.SupportsDecryption)
                    {
                        // Keep prompting until the password is accepted, extraction ends with a different
                        // result, or the user dismisses the dialog.
                        while (docContent.Result == ContentResult.WrongPassword)
                        {
                            // A dialog shown with ShowDialog() is not disposed automatically when it closes, so each
                            // prompt is wrapped in "using" (assumes PasswordForm derives from System.Windows.Forms.Form).
                            using (var passwordDialog = new PasswordForm())
                            {
                                if (passwordDialog.ShowDialog() != DialogResult.OK)
                                {
                                    // User cancelled: leave the retry loop; docContent keeps the WrongPassword result.
                                    break;
                                }

                                // Try the user entered password:
                                docContent = docExtractor.ExtractContent(passwordDialog.Password);
                            }
                        }
                    }

                    if (docContent.Result == ContentResult.Ok)
                    {
                        // Success - Do something with extracted content, for example use Lucene.NET to index the text and index the metadata as fields

                        //
                        // Check for special document content classes that derive from DocumentContent class and
                        // have extra content extracted:
                        //
                        if (docContent is EmailDocumentContent)
                        {
                            var emailDocContent = (EmailDocumentContent)docContent;
                            // Do something with the extra email related content...
                        }
                        else if (docContent is HtmlDocumentContent)
                        {
                            var htmlDocContent = (HtmlDocumentContent)docContent;
                            // Do something with the extra HTML related content... the extra HTML related content such as HyperLinks property
                            // can be used for web-crawling a website
                        }
                        else if (docContent is PdfDocumentContent)
                        {
                            var pdfDocContent = (PdfDocumentContent)docContent;
                            // Do something with the extra PDF related content...
                        }
                    }
                    else
                    {
                        // Some level of Error - Documents that have an error often times still have some content extracted such as metadata,
                        // attachments, or text (there are fallback text extraction procedures for some formats and/or binary-text filtering
                        // can be used to extract useful text in a lot of cases)
                    }
                }
                break;

            case ContentExtractorType.Archive:
                {
                    var archiveExtractor = (IArchiveExtractor)contentExtractorResult.ContentExtractor;
                    // TODO: A different help topic will show how to use this interface
                }
                break;
            case ContentExtractorType.MailStore:
                {
                    var mailStoreExtractor = (IMailStoreExtractor)contentExtractorResult.ContentExtractor;
                    // TODO: A different help topic will show how to use this interface
                }
                break;
            case ContentExtractorType.Database:
                {
                    var databaseExtractor = (IDatabaseExtractor)contentExtractorResult.ContentExtractor;
                    // TODO: A different help topic will show how to use this interface
                }
                break;
            case ContentExtractorType.DocumentStore:
                {
                    // NOTE(review): this case casts to IDocumentContentExtractor, the same interface as the
                    // Document case, while every other case uses its own interface - possibly a copy-paste
                    // slip; confirm the intended interface for document stores.
                    var docStoreExtractor = (IDocumentContentExtractor)contentExtractorResult.ContentExtractor;
                    // TODO: A different help topic will show how to use this interface
                }
                break;
            case ContentExtractorType.Unsupported:
                {
                    var bin2TextExtractor = (IUnsupportedExtractor)contentExtractorResult.ContentExtractor;
                    // TODO: A different help topic will show how to use this interface
                }
                break;
            case ContentExtractorType.LargeUnsupported:
                {
                    var largeDocBin2TextExtractor = (ILargeUnsupportedExtractor)contentExtractorResult.ContentExtractor;
                    // TODO: A different help topic will show how to use this interface
                }
                break;
            case ContentExtractorType.LargeEncodedText:
                {
                    var largeEncodedTextExtractor = (ILargeEncodedTextExtractor)contentExtractorResult.ContentExtractor;
                    // TODO: A different help topic will show how to use this interface
                }
                break;
        }
    }
}
The following unit test example illustrates the various kinds of document content that are extracted, such as text, metadata, attachments, hashes, and the languages identified in the extracted text:
/// <summary>
/// Extracts text and metadata from an Outlook .msg test file and checks the extracted text,
/// child documents, hashes, sender/from addresses, body flags, language identification
/// results, and selected metadata fields.
/// </summary>
[TestMethod]
public void OutlookMsgTextAndMetadataTest()
{
    var settings = new ContentExtractionSettings();
    settings.CalculateBinaryHash = true;
    settings.CalculateContentHash = true;
    settings.ShowUtcOffsetForTimeText = true;
    settings.ExtractEmbeddedDocuments = false;
    settings.ExtractOfficeEmbeddedMedia = false;
    settings.CollectionTimeZone = TimeZoneInfo.Utc;
    settings.ExtractionType = ContentExtractionType.TextAndMetadata;

    var file = Path.Combine(_testFilesBasePath, @"Deduplication\Email", "test.msg");
    var sentDateUtc = new DateTime(2007, 1, 5, 0, 16, 11, DateTimeKind.Utc);

    using (var stream = File.OpenRead(file))
    {
        var idResult = DocumentIdentifier.Identify(stream, file);
        var extractorResult = ContentExtractorFactory.GetContentExtractor(stream, idResult, file, settings);

        // Fail here, before the extractor is used, if the factory reported an error.
        Assert.IsFalse(extractorResult.HasError);

        var extractorType = extractorResult.ContentExtractor.ContentExtractorType;
        Assert.IsTrue(extractorType == ContentExtractorType.Document);

        var docExtractor = (IDocumentContentExtractor)extractorResult.ContentExtractor;
        var docContent = docExtractor.ExtractContent();

        // Assert that the DocumentContent object is an email type and is safe to type-cast to EmailDocumentContent:
        Assert.IsTrue(docContent.IsEmailType);
        var emailContent = docContent as EmailDocumentContent;
        // "as" yields null when the cast is not possible; assert so a mismatch is reported as a
        // test failure instead of a NullReferenceException on the next line.
        Assert.IsNotNull(emailContent);

        Assert.IsTrue(emailContent.Result == ContentResult.Ok);

        Assert.IsTrue(emailContent.FormatId.ID == Id.OutlookMessage);
        // Ordinal comparisons: the expected text is fixed test data, so the checks must not depend on the current culture.
        Assert.IsTrue(emailContent.ExtractedText.StartsWith("From: dummy <dummy@fake.org>", StringComparison.Ordinal));
        Assert.IsTrue(emailContent.ExtractedText.IndexOf("Subject: RE: Party", StringComparison.Ordinal) > 0);
        Assert.IsTrue(emailContent.ExtractedText.IndexOf("Are you going to the party?", StringComparison.Ordinal) > 0);

        Assert.IsTrue(emailContent.ChildDocuments.Count == 1);
        Assert.IsTrue(emailContent.ChildDocuments[0].Name == "directions.doc");

        Assert.IsTrue(emailContent.MD5BinaryHash == "471B0C2443F2B2E610B6BF905480DEF5");
        Assert.IsTrue(emailContent.SHA1BinaryHash == "86EF0955E35A6BDBF003BE813FBF0395B6D7546F");
        // Email types support content hash, the content hash aids in de-duplicating the same email whether they are saved
        // as .msg, .eml, or .emlx formats. If you save an .eml message as an .msg using Outlook, these 2 files will have
        // different binary hashes due to different bytes in files. However, the content hash most likely will be the same
        // for both files.
        Assert.IsTrue(emailContent.MD5ContentHash == "2F780AEA9DD7930439B340FFBC660A53");
        Assert.IsTrue(emailContent.SHA1ContentHash == "EE6B61C9BAEBC87D6F5D2CB33B65EEA9B0F5C3E2");
        // Email types have extra hashes calculated:
        Assert.IsTrue(emailContent.Sha1HeaderHash == "806A056F8F7BAD3FAE31A226B26EE1E3653AF423");
        Assert.IsTrue(emailContent.Sha1BodyHash == "6F38C846E0E468732FBCA8CCB9EDDE131A1501A6");
        Assert.IsTrue(emailContent.Sha1RecipientsHash == "D8FE882FC2E8AE88D0504D58BBF324C7677BCD8B");

        Assert.IsTrue(emailContent.Subject == "RE: Party");

        Assert.IsTrue(emailContent.Sender != null);
        Assert.IsTrue(emailContent.Sender.Name == "dummy");
        Assert.IsTrue(emailContent.Sender.AddressType == EmailAddressType.Sender);
        Assert.IsTrue(emailContent.Sender.SmtpAddress == "dummy@fake.org");
        Assert.IsTrue(emailContent.Sender.X500DN == string.Empty);

        Assert.IsTrue(emailContent.From != null);
        Assert.IsTrue(emailContent.From.Count == 1);
        Assert.IsTrue(emailContent.Sender.Name == emailContent.From[0].Name);
        Assert.IsTrue(emailContent.From[0].AddressType == EmailAddressType.From);
        Assert.IsTrue(emailContent.Sender.SmtpAddress == emailContent.From[0].SmtpAddress);
        Assert.IsTrue(emailContent.From[0].X500DN == null);
        Assert.IsTrue(emailContent.SentDate.Value.CompareTo(sentDateUtc) == 0);

        Assert.IsTrue(emailContent.HasHtmlBody);
        Assert.IsTrue(emailContent.HasTextBody);
        Assert.IsTrue(emailContent.HasRtfBody);
        Assert.IsTrue(emailContent.BodyType == EmailBodyType.RtfEncapsulatedHTML); // HTML body originated from RTF body (was encapsulated in RTF body)
        Assert.IsTrue(emailContent.MessageId == "<1234567892423424242.fake.org>");

        // Extracted text language identification results:
        Assert.IsTrue(emailContent.LanguageIdResults != null);
        Assert.IsTrue(emailContent.LanguageIdResults.Count == 1);
        Assert.IsTrue(emailContent.LanguageIdResults[0].LangIso639 == "en"); // English

        Assert.IsTrue(emailContent.Metadata.Count == 34); // 34 email metadata fields extracted in total

        // We use "known" string and datetime metadata helper classes for the metadata field names (in this test it is known that these fields exist
        // in the test file):
        Assert.IsTrue(((StringProperty)emailContent.Metadata[KnownStringMetadataFields.From]).Value == "dummy <dummy@fake.org>");
        Assert.IsTrue(((StringProperty)emailContent.Metadata[KnownStringMetadataFields.SenderName]).Value == "dummy");
        Assert.IsTrue(((StringProperty)emailContent.Metadata[KnownStringMetadataFields.SentRepresentingName]).Value == "dummy");
        Assert.IsTrue(((StringProperty)emailContent.Metadata[KnownStringMetadataFields.SenderSmtpAddress]).Value == "dummy@fake.org");
        Assert.IsTrue(((StringProperty)emailContent.Metadata[KnownStringMetadataFields.To]).Value == "<friendOfdummy@fake.org>");
        Assert.IsTrue(((DateTimeProperty)emailContent.Metadata[KnownDateTimeMetadataFields.SentDate]).Value.CompareTo(sentDateUtc) == 0);
    }
}