How to use the IDocumentContentExtractor Interface

This section shows how to use the IDocumentContentExtractor interface. This interface is used to extract content from office documents, emails, PDFs, HTML files, raster images, multimedia files, and vector images. The section also shows, by example, how to decrypt the file formats that support decryption.

The IDocumentContentExtractor interface has one method used to extract document content: DocumentContent ExtractContent(string password = null)

The password is optional and is used to decrypt file types that support decryption, such as Microsoft Word, Excel, PowerPoint, PDF, and OpenDocument formats.

The following example shows how to use the IDocumentContentExtractor interface to extract content from a password-encrypted document by prompting the user to enter a password via a dialog box. Alternatively, if the user knows the passwords of the documents they are going to process, they can set up a password list to cycle through instead of prompting via a dialog box.

  Note

Attempting to decrypt a document with a password is an expensive operation by design. This SDK is not optimized to be a password cracker and should not be used as one. Any password list that is cycled through when encrypted documents are encountered should contain only known passwords that decrypt the password-encrypted documents being processed.

IDocumentContentExtractor Interface Usage
// Extraction options used for every document processed by the example below.
var settings = new ContentExtractionSettings
{
    // Hashing: binary hashes, plus content hashes on supported formats such as emails and Office documents
    CalculateBinaryHash  = true,
    CalculateContentHash = true,

    // Embedded items: attachments\embedded documents, and embedded media for Office formats
    ExtractEmbeddedDocuments   = true,
    ExtractOfficeEmbeddedMedia = true,

    // Output: we want both text and metadata extracted
    ExtractionType = ContentExtractionType.TextAndMetadata
};

// Walks a single file through the extraction pipeline: identify the format, obtain the matching
// content extractor, then dispatch on the extractor type. Only the Document case is shown in full;
// it retries extraction with a user-supplied password while the document reports a wrong password.
using (var stream = File.OpenRead(filePath))
{
    //========================================================================================
    // Step 1 - Identify document file format:
    //========================================================================================
    var docIdResult = DocumentIdentifier.Identify(stream, filePath);

    //========================================================================================
    // Step 2 - Get the content extractor result for the file format:
    //========================================================================================
    var contentExtractorResult = ContentExtractorFactory.GetContentExtractor(stream, docIdResult, filePath, settings);

    if (contentExtractorResult.HasError)
    {
        LogErrorMessage(string.Format("Error getting content extractor for file ID {0}: {1}", docIdResult.ID, contentExtractorResult.Error));
    }
    else
    {
        //========================================================================================
        // Step 3 - Get the specific IContentExtractor derived interface for the file format:
        //========================================================================================
        var extractorType = contentExtractorResult.ContentExtractor.ContentExtractorType;

        switch (extractorType)
        {
            case ContentExtractorType.Document:
                {
                    // The "ContentExtractorType.Document" extractor type indicates the IDocumentContentExtractor interface is to be used
                    var docExtractor = ((IDocumentContentExtractor)contentExtractorResult.ContentExtractor);

                    // First attempt passes no password (the parameter defaults to null)
                    var docContent = docExtractor.ExtractContent();

                    // Password cycling for supported encrypted documents:
                    //  - DocumentContent.Result = ContentResult.WrongPassword, indicating wrong password, because on 1st call to ExtractContent
                    //    method above we passed in null (default value) and this document is encrypted
                    //  - User can also set up a password list of known passwords to cycle through when encountering encrypted documents.
                    //  - Check if "docExtractor.SupportsDecryption" is true, if true then this format supports decryption
                    if (docContent.Result == ContentResult.WrongPassword && docContent.IsEncrypted && docExtractor.SupportsDecryption)
                    {
                        // Keep prompting until the password is accepted (or extraction ends with some
                        // other result), or until the user dismisses the dialog without pressing OK.
                        while (docContent.Result == ContentResult.WrongPassword)
                        {
                            // A form shown with ShowDialog() is not disposed when it closes, so dispose
                            // each prompt explicitly (assumes PasswordForm derives from Form - confirm).
                            using (var passwordDialog = new PasswordForm())
                            {
                                if (passwordDialog.ShowDialog() != DialogResult.OK)
                                {
                                    // User cancelled: stop retrying, docContent keeps the WrongPassword result
                                    break;
                                }

                                // Try the user entered password:
                                docContent = docExtractor.ExtractContent(passwordDialog.Password);
                            }
                        }
                    }

                    if (docContent.Result == ContentResult.Ok)
                    {
                        // Success - Do something with extracted content, for example use Lucene.NET to index the text and index the metadata as fields

                        //
                        // Check for special document content classes that derive from DocumentContent class and
                        // have extra content extracted:
                        //
                        if (docContent is EmailDocumentContent)
                        {
                            var emailDocContent = (EmailDocumentContent)docContent;
                            // Do something with extra email related content...
                        }
                        else if (docContent is HtmlDocumentContent)
                        {
                            var htmlDocContent = (HtmlDocumentContent)docContent;
                            // Do something with extra HTML related content... the extra HTML related content such as HyperLinks property
                            // can be used for web-crawling a website
                        }
                        else if (docContent is PdfDocumentContent)
                        {
                            var pdfDocContent = (PdfDocumentContent)docContent;
                            // Do something with extra PDF related content...
                        }
                    }
                    else
                    {
                        // Some level of Error - Documents that have an error often times still have some content extracted such as metadata,
                        // attachments, or text (there are fallback text extraction procedures for some formats and/or binary-text filtering
                        // can be used to extract useful text in a lot of cases)
                    }
                }
                break;

            case ContentExtractorType.Archive:
                {
                    var archiveExtractor = ((IArchiveExtractor)contentExtractorResult.ContentExtractor);
                    // TODO: A different help topic will show how to use this interface
                }
                break;
            case ContentExtractorType.MailStore:
                {
                    var mailStoreExtractor = ((IMailStoreExtractor)contentExtractorResult.ContentExtractor);
                    // TODO: A different help topic will show how to use this interface
                }
                break;
            case ContentExtractorType.Database:
                {
                    var databaseExtractor = ((IDatabaseExtractor)contentExtractorResult.ContentExtractor);
                    // TODO: A different help topic will show how to use this interface
                }
                break;
            case ContentExtractorType.DocumentStore:
                {
                    // NOTE(review): this casts to IDocumentContentExtractor, the same interface as the Document case -
                    // confirm whether a dedicated document store interface was intended here.
                    var docStoreExtractor = ((IDocumentContentExtractor)contentExtractorResult.ContentExtractor);
                    // TODO: A different help topic will show how to use this interface
                }
                break;
            case ContentExtractorType.Unsupported:
                {
                    var bin2TextExtractor = ((IUnsupportedExtractor)contentExtractorResult.ContentExtractor);
                    // TODO: A different help topic will show how to use this interface
                }
                break;
            case ContentExtractorType.LargeUnsupported:
                {
                    var largeDocBin2TextExtractor = ((ILargeUnsupportedExtractor)contentExtractorResult.ContentExtractor);
                    // TODO: A different help topic will show how to use this interface
                }
                break;
            case ContentExtractorType.LargeEncodedText:
                {
                    var largeEncodedTextExtractor = ((ILargeEncodedTextExtractor)contentExtractorResult.ContentExtractor);
                    // TODO: A different help topic will show how to use this interface
                }
                break;
        }
    }
}

The following unit test example code illustrates the various document content extracted like text, metadata, attachments, hashes, and languages identified in the extracted text:

Email Document
/// <summary>
/// Extracts text and metadata from an Outlook .msg test file and verifies the email-specific
/// content: extracted text, child documents, binary/content/email hashes, sender and "from"
/// addresses, body flags, language identification, and selected metadata fields.
/// </summary>
[TestMethod]
public void OutlookMsgTextAndMetadataTest()
{
    var settings = new ContentExtractionSettings
    {
        CalculateBinaryHash        = true,
        CalculateContentHash       = true,
        ShowUtcOffsetForTimeText   = true,
        ExtractEmbeddedDocuments   = false,
        ExtractOfficeEmbeddedMedia = false,
        CollectionTimeZone         = TimeZoneInfo.Utc,
        ExtractionType             = ContentExtractionType.TextAndMetadata
    };

    var file        = Path.Combine(_testFilesBasePath, @"Deduplication\Email", "test.msg");
    var sentDateUtc = new DateTime(2007, 1, 5, 0, 16, 11, DateTimeKind.Utc);

    using (var stream = File.OpenRead(file))
    {
        var idResult        = DocumentIdentifier.Identify(stream, file);
        var extractorResult = ContentExtractorFactory.GetContentExtractor(stream, idResult, file, settings);

        // Fail with a clear message instead of dereferencing the extractor of a failed result:
        Assert.IsFalse(extractorResult.HasError, "Failed to get a content extractor for the test file.");

        var extractorType = extractorResult.ContentExtractor.ContentExtractorType;
        Assert.AreEqual(ContentExtractorType.Document, extractorType);

        var docExtractor = ((IDocumentContentExtractor)extractorResult.ContentExtractor);
        var docContent   = docExtractor.ExtractContent();

        // Assert that the DocumentContent object is an email type and is safe to type-cast to EmailDocumentContent:
        Assert.IsTrue(docContent.IsEmailType);
        var emailContent = docContent as EmailDocumentContent;
        Assert.IsNotNull(emailContent, "Email type content is expected to be an EmailDocumentContent instance.");

        Assert.AreEqual(ContentResult.Ok, emailContent.Result);

        Assert.AreEqual(Id.OutlookMessage, emailContent.FormatId.ID);

        // Ordinal comparisons: the expected text is fixed test data, not culture-dependent prose.
        // The "> 0" checks require the match to occur after the start of the text.
        Assert.IsTrue(emailContent.ExtractedText.StartsWith("From: dummy <dummy@fake.org>", StringComparison.Ordinal));
        Assert.IsTrue(emailContent.ExtractedText.IndexOf("Subject: RE: Party", StringComparison.Ordinal) > 0);
        Assert.IsTrue(emailContent.ExtractedText.IndexOf("Are you going to the party?", StringComparison.Ordinal) > 0);

        Assert.AreEqual(1, emailContent.ChildDocuments.Count);
        Assert.AreEqual("directions.doc", emailContent.ChildDocuments[0].Name);

        Assert.AreEqual("471B0C2443F2B2E610B6BF905480DEF5", emailContent.MD5BinaryHash);
        Assert.AreEqual("86EF0955E35A6BDBF003BE813FBF0395B6D7546F", emailContent.SHA1BinaryHash);
        // Email types support content hash, the content hash aids in de-duplicating the same email whether it is saved
        // as .msg, .eml, or .emlx format. If you save an .eml message as an .msg using Outlook, these 2 files will have
        // different binary hashes due to different bytes in files. However, the content hash most likely will be the same
        // for both files.
        Assert.AreEqual("2F780AEA9DD7930439B340FFBC660A53", emailContent.MD5ContentHash);
        Assert.AreEqual("EE6B61C9BAEBC87D6F5D2CB33B65EEA9B0F5C3E2", emailContent.SHA1ContentHash);
        // Email types have extra hashes calculated:
        Assert.AreEqual("806A056F8F7BAD3FAE31A226B26EE1E3653AF423", emailContent.Sha1HeaderHash);
        Assert.AreEqual("6F38C846E0E468732FBCA8CCB9EDDE131A1501A6", emailContent.Sha1BodyHash);
        Assert.AreEqual("D8FE882FC2E8AE88D0504D58BBF324C7677BCD8B", emailContent.Sha1RecipientsHash);

        Assert.AreEqual("RE: Party", emailContent.Subject);

        Assert.IsNotNull(emailContent.Sender);
        Assert.AreEqual("dummy", emailContent.Sender.Name);
        Assert.AreEqual(EmailAddressType.Sender, emailContent.Sender.AddressType);
        Assert.AreEqual("dummy@fake.org", emailContent.Sender.SmtpAddress);
        Assert.AreEqual(string.Empty, emailContent.Sender.X500DN);

        Assert.IsNotNull(emailContent.From);
        Assert.AreEqual(1, emailContent.From.Count);
        Assert.AreEqual(emailContent.From[0].Name, emailContent.Sender.Name);
        Assert.AreEqual(EmailAddressType.From, emailContent.From[0].AddressType);
        Assert.AreEqual(emailContent.From[0].SmtpAddress, emailContent.Sender.SmtpAddress);
        Assert.IsNull(emailContent.From[0].X500DN);
        Assert.AreEqual(0, emailContent.SentDate.Value.CompareTo(sentDateUtc));

        Assert.IsTrue(emailContent.HasHtmlBody);
        Assert.IsTrue(emailContent.HasTextBody);
        Assert.IsTrue(emailContent.HasRtfBody);
        Assert.AreEqual(EmailBodyType.RtfEncapsulatedHTML, emailContent.BodyType); // HTML body originated from RTF body (was encapsulated in RTF body)
        Assert.AreEqual("<1234567892423424242.fake.org>", emailContent.MessageId);

        // Extracted text language identification results:
        Assert.IsNotNull(emailContent.LanguageIdResults);
        Assert.AreEqual(1, emailContent.LanguageIdResults.Count);
        Assert.AreEqual("en", emailContent.LanguageIdResults[0].LangIso639); // English

        Assert.AreEqual(34, emailContent.Metadata.Count); // 34 email metadata fields extracted in total

        // We use "known" string and datetime metadata helper classes for the metadata field names (in this test it is known that these fields exist
        // in the test file):
        Assert.AreEqual("dummy <dummy@fake.org>", ((StringProperty)emailContent.Metadata[KnownStringMetadataFields.From]).Value);
        Assert.AreEqual("dummy", ((StringProperty)emailContent.Metadata[KnownStringMetadataFields.SenderName]).Value);
        Assert.AreEqual("dummy", ((StringProperty)emailContent.Metadata[KnownStringMetadataFields.SentRepresentingName]).Value);
        Assert.AreEqual("dummy@fake.org", ((StringProperty)emailContent.Metadata[KnownStringMetadataFields.SenderSmtpAddress]).Value);
        Assert.AreEqual("<friendOfdummy@fake.org>", ((StringProperty)emailContent.Metadata[KnownStringMetadataFields.To]).Value);
        Assert.AreEqual(0, ((DateTimeProperty)emailContent.Metadata[KnownDateTimeMetadataFields.SentDate]).Value.CompareTo(sentDateUtc));
    }
}

See Also