Extract Content using the SDK ContentExtractorFactory
This topic and sub-topics show how to use the Open Discover SDK content extractor factory pattern to extract content from documents and items from archives and mail store containers.
The .NET assemblies that make up Open Discover SDK are x64 release builds (not AnyCPU) due to x64 dependencies. Therefore, applications that reference and use the SDK assemblies MUST also be x64 builds.
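The steps to extract document content or to extract items from archive or mail store containers (as shown in the example below):
1. Identify the document file format using DocumentIdentifier.Identify.
2. Get the content extractor result for the file format using ContentExtractorFactory.GetContentExtractor.
3. Get the specific IContentExtractor derived interface for the file format from the content extractor result.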
Method ContentExtractorFactory.GetContentExtractor takes a ContentExtractionSettings object as an argument. The ContentExtractionSettings class determines what content is extracted, whether hashes are calculated, whether languages are identified in the extracted text, and more.
The following are all of the IContentExtractor derived interfaces:
Diagram of the IContentExtractor derived interfaces:
The following example shows the pattern of how to get the specific IContentExtractor derived interfaces. How to use the specific derived interfaces is discussed in this section's sub-topics.
// Example: obtain the IContentExtractor derived interface that matches a file's format.
// NOTE: 'filePath' and 'LogErrorMessage' are not defined in this sample; they are assumed
// to be supplied by the calling application.

var settings = new ContentExtractionSettings();
settings.CalculateBinaryHash = true;         // Calculate binary hashes
settings.CalculateContentHash = true;        // Calculate content hashes on supported formats such as emails and Office documents
settings.ExtractEmbeddedDocuments = true;    // Extract attachments/embedded documents
settings.ExtractOfficeEmbeddedMedia = true;  // Extract embedded media for Office formats
settings.ExtractionType = ContentExtractionType.TextAndMetadata; // We want text and metadata extracted

using (var stream = File.OpenRead(filePath))
{
    //========================================================================================
    // Step 1 - Identify document file format:
    //========================================================================================
    var docIdResult = DocumentIdentifier.Identify(stream, filePath);

    //========================================================================================
    // Step 2 - Get the content extractor result for the file format:
    //========================================================================================
    var contentExtractorResult = ContentExtractorFactory.GetContentExtractor(stream, docIdResult, filePath, settings);

    if (contentExtractorResult.HasError)
    {
        LogErrorMessage(string.Format("Error getting content extractor for file ID {0}: {1}", docIdResult.ID, contentExtractorResult.Error));
    }
    else
    {
        //========================================================================================
        // Step 3 - Get the specific IContentExtractor derived interface for the file format:
        //========================================================================================
        var extractorType = contentExtractorResult.ContentExtractor.ContentExtractorType;

        switch (extractorType)
        {
            case ContentExtractorType.Archive:
                {
                    var archiveExtractor = (IArchiveExtractor)contentExtractorResult.ContentExtractor;
                    // TODO: Following help topics will show how to use this interface
                }
                break;
            case ContentExtractorType.Document:
                {
                    var docExtractor = ((IDocumentContentExtractor)contentExtractorResult.ContentExtractor);
                    // TODO: Following help topics will show how to use this interface
                }
                break;
            case ContentExtractorType.MailStore:
                {
                    var mailStoreExtractor = ((IMailStoreExtractor)contentExtractorResult.ContentExtractor);
                    // TODO: Following help topics will show how to use this interface
                }
                break;
            case ContentExtractorType.Database:
                {
                    var databaseExtractor = ((IDatabaseExtractor)contentExtractorResult.ContentExtractor);
                    // TODO: Following help topics will show how to use this interface
                }
                break;
            case ContentExtractorType.DocumentStore:
                {
                    var docStoreExtractor = ((IDocumentStoreExtractor)contentExtractorResult.ContentExtractor);
                    // TODO: Following help topics will show how to use this interface
                }
                break;
            case ContentExtractorType.Unsupported:
                {
                    // Binary-to-text extraction: Note, if property ContentExtractionSettings.BinaryToTextOnUnsupportedTypes is false, then calling
                    // IUnsupportedExtractor.ExtractContent will only calculate binary hashes without performing binary-to-text filtering.
                    // Binary-to-text is not useful for file formats (e.g., Id.MPEG1ElementaryStream) that do not have any textual
                    // content. It is up to the user to filter these formats out using either file format Id or file format classification.
                    var bin2TextExtractor = ((IUnsupportedExtractor)contentExtractorResult.ContentExtractor);
                    // TODO: Following help topics will show how to use this interface
                }
                break;
            case ContentExtractorType.LargeUnsupported:
                {
                    // Binary-to-text extraction: for very 'large' binary documents the user should extract to a file stream
                    // rather than a memory stream, because the filtered text could get into the gigabytes
                    // depending on the BLOB size and content.
                    var largeDocBin2TextExtractor = ((ILargeUnsupportedExtractor)contentExtractorResult.ContentExtractor);
                    // TODO: Following help topics will show how to use this interface
                }
                break;
            case ContentExtractorType.LargeEncodedText:
                {
                    // "large" encoded text file extraction
                    var largeEncodedTextExtractor = ((ILargeEncodedTextExtractor)contentExtractorResult.ContentExtractor);
                    // TODO: Following help topics will show how to use this interface
                }
                break;
        }
    }
}