
Commit b612ab4

acdha authored and toastdriven committed
Solr backend support for rich-content extraction
This allows indexes to use text extracted from binary files as well as normal database content. Note: requires a very recent pysolr - see https://github.com/acdha/pysolr/tree/rich-content-extraction
1 parent eec3f78 commit b612ab4

File tree

7 files changed, +150 −0 lines changed


docs/index.rst

Lines changed: 1 addition & 0 deletions
@@ -60,6 +60,7 @@ you may want to include in your application.
     autocomplete
     boost
     multiple_index
+    rich_content_extraction


 Reference

docs/rich_content_extraction.rst

Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
.. _ref-rich_content_extraction:

=======================
Rich Content Extraction
=======================

For some projects it is desirable to index text content which is stored in
structured files such as PDFs, Microsoft Office documents, images, etc.
Currently only Solr's `ExtractingRequestHandler`_ is directly supported by
Haystack but the approach below could be used with any backend which supports
this feature.

.. _`ExtractingRequestHandler`: http://wiki.apache.org/solr/ExtractingRequestHandler

Extracting Content
==================

:meth:`SearchBackend.extract_file_contents` accepts a file or file-like object
and returns a dictionary containing two keys: ``metadata`` and ``contents``. The
``contents`` value will be a string containing all of the text which the backend
managed to extract from the file contents. ``metadata`` will always be a
dictionary but the keys and values will vary based on the underlying extraction
engine and the type of file provided.
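For illustration (this sketch is not part of the committed file), calling the
method directly against a configured Solr backend might look like the
following; the ``resume.pdf`` filename is hypothetical::

    from haystack import connections

    backend = connections['default'].get_backend()

    file_obj = open('resume.pdf', 'rb')
    extracted = backend.extract_file_contents(file_obj)
    file_obj.close()

    if extracted is not None:
        text = extracted['contents']      # single string of extracted text
        metadata = extracted['metadata']  # dict whose keys vary by file type,
                                          # e.g. Content-Type or Author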
Indexing Extracted Content
==========================

Generally you will want to include the extracted text in your main document
field along with everything else specified in your search template. This example
shows how to override a hypothetical ``FileIndex``'s ``prepare`` method to
include the extracted content along with information retrieved from the
database::

    def prepare(self, obj):
        data = super(FileIndex, self).prepare(obj)

        # This could also be a regular Python open() call, a StringIO instance
        # or the result of opening a URL. Note that due to a library limitation
        # file_obj must have a .name attribute even if you need to set one
        # manually before calling extract_file_contents:
        file_obj = obj.the_file.open()

        extracted_data = self.backend.extract_file_contents(file_obj)

        # Now we'll finally perform the template processing to render the
        # text field with *all* of our metadata visible for templating:
        t = loader.select_template(('search/indexes/myapp/file_text.txt', ))
        data['text'] = t.render(Context({'object': obj,
                                         'extracted': extracted_data}))

        return data

This allows you to insert the extracted text at the appropriate place in your
template, modified or intermixed with database content as appropriate:

.. code-block:: html+django

    {{ object.title }}
    {{ object.owner.name }}


    {% for k, v in extracted.metadata.items %}
        {% for val in v %}
            {{ k }}: {{ val|safe }}
        {% endfor %}
    {% endfor %}

    {{ extracted.contents|striptags|safe }}
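As a supplementary sketch (not part of this commit), the hypothetical
``FileIndex`` assumed above might be declared roughly as follows; the model
name and field names are illustrative only::

    from haystack import indexes
    from myapp.models import File

    class FileIndex(indexes.SearchIndex):
        # The document field is rendered from the template shown above,
        # which mixes database fields with the extracted content.
        text = indexes.CharField(document=True, use_template=True,
                                 template_name='search/indexes/myapp/file_text.txt')

        def prepare(self, obj):
            data = super(FileIndex, self).prepare(obj)
            # ...extraction and template rendering as shown in the example above...
            return data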

docs/searchbackend_api.rst

Lines changed: 11 additions & 0 deletions
@@ -70,6 +70,17 @@ results the search backend found.
     This method MUST be implemented by each backend, as it will be highly
     specific to each one.

+ ``extract_file_contents``
+ -------------------------
+
+ .. method:: SearchBackend.extract_file_contents(self, file_obj)
+
+     Perform text extraction on the provided file or file-like object. Returns
+     either None or a dictionary containing the keys ``contents`` and
+     ``metadata``. The ``contents`` field will always contain the extracted
+     text content returned by the underlying search engine but ``metadata``
+     may vary considerably based on the backend and the input file.
+
  ``prep_value``
  --------------
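A brief usage sketch (not part of this commit) showing the defensive check that
the None return value calls for::

    extracted = backend.extract_file_contents(file_obj)

    if extracted is None:
        # Extraction failed or is unsupported by this backend; fall back to
        # indexing only the database content.
        extracted = {'contents': '', 'metadata': {}}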

haystack/backends/__init__.py

Lines changed: 17 additions & 0 deletions
@@ -135,6 +135,23 @@ def more_like_this(self, model_instance, additional_query_string=None, result_cl
          """
          raise NotImplementedError("Subclasses must provide a way to fetch similar record via the 'more_like_this' method if supported by the backend.")

+     def extract_file_contents(self, file_obj):
+         """
+         Hook to allow backends which support rich-content types such as PDF,
+         Word, etc. extraction to process the provided file object and return
+         the contents for indexing
+
+         Returns None if metadata cannot be extracted; otherwise returns a
+         dictionary containing at least two keys:
+
+         :contents:
+             Extracted full-text content, if applicable
+         :metadata:
+             key:value pairs of text strings
+         """
+
+         raise NotImplementedError("Subclasses must provide a way to extract metadata via the 'extract' method if supported by the backend.")
+
      def build_schema(self, fields):
          """
          Takes a dictionary of fields and returns schema information.
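As a sketch of how a third-party backend might satisfy this hook (not part of
this commit; ``some_extraction_library`` is purely hypothetical)::

    import logging

    class MyBackend(BaseSearchBackend):
        def extract_file_contents(self, file_obj):
            try:
                text = some_extraction_library.get_text(file_obj)
            except Exception, e:
                logging.getLogger('haystack').warning(
                    u"Unable to extract file contents: %s", e, exc_info=True)
                return None

            # Honour the documented contract: contents plus a metadata dict.
            return {'contents': text, 'metadata': {}}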

haystack/backends/solr_backend.py

Lines changed: 32 additions & 0 deletions
@@ -377,6 +377,38 @@ def build_schema(self, fields):
          return (content_field_name, schema_fields)

+     def extract_file_contents(self, file_obj):
+         """Extract text and metadata from a structured file (PDF, MS Word, etc.)
+
+         Uses the Solr ExtractingRequestHandler, which is based on Apache Tika.
+         See the Solr wiki for details:
+
+             http://wiki.apache.org/solr/ExtractingRequestHandler
+
+         Due to the way the ExtractingRequestHandler is implemented it completely
+         replaces the normal Haystack indexing process with several unfortunate
+         restrictions: only one file per request, the extracted data is added to
+         the index with no ability to modify it, etc. To simplify the process and
+         allow for more advanced use we'll run using the extract-only mode to
+         return the extracted data without adding it to the index so we can then
+         use it within Haystack's normal templating process.
+
+         Returns None if metadata cannot be extracted; otherwise returns a
+         dictionary containing at least two keys:
+
+         :contents:
+             Extracted full-text content, if applicable
+         :metadata:
+             key:value pairs of text strings
+         """
+
+         try:
+             return self.conn.extract(file_obj)
+         except StandardError, e:
+             self.log.warning(u"Unable to extract file contents: %s", e,
+                              exc_info=True, extra={"data": {"file": file_obj}})
+             return None
+

  class SolrSearchQuery(BaseSearchQuery):
      def matching_all_fragment(self):
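For context (a sketch, not part of this commit), the underlying pysolr call
added in the rich-content-extraction branch behaves roughly like this; the
Solr URL and filename are placeholders::

    import pysolr

    conn = pysolr.Solr('http://localhost:8983/solr')

    file_obj = open('test.pdf', 'rb')
    # Sends the file to Solr's ExtractingRequestHandler in extract-only mode,
    # so the extracted text and metadata come back without being indexed.
    result = conn.extract(file_obj)
    file_obj.close()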

tests/content_extraction/test.pdf

47.1 KB
Binary file not shown.

tests/solr_tests/tests/solr_backend.py

Lines changed: 21 additions & 0 deletions
@@ -2,6 +2,8 @@
  import datetime
  from decimal import Decimal
  import logging
+ import os
+
  import pysolr
  from django.conf import settings
  from django.test import TestCase
@@ -1208,3 +1210,22 @@ def test_boost(self):
              'core.afourthmockmodel.2',
              'core.afourthmockmodel.4'
          ])
+
+
+ class LiveSolrContentExtractionTestCase(TestCase):
+     def setUp(self):
+         super(LiveSolrContentExtractionTestCase, self).setUp()
+
+         self.sb = connections['default'].get_backend()
+
+     def test_content_extraction(self):
+         f = open(os.path.join(os.path.dirname(__file__),
+                               "..", "..", "content_extraction", "test.pdf"),
+                  "rb")
+
+         data = self.sb.extract_file_contents(f)
+
+         self.assertTrue("haystack" in data['contents'])
+         self.assertEqual(data['metadata']['Content-Type'], [u'application/pdf'])
+         self.assertTrue(any(i for i in data['metadata']['Keywords'] if 'SolrCell' in i))

0 commit comments
