http://www.zorba-xquery.com/modules/read-pdf

View as XML or JSON.

This module provides functionality to read the text from PDF documents and to render PDF documents to images. The Apache PDFBox library is used to implement these functions.


Note: Since this module has a Java library dependency, a JVM is required to be installed on the system. For Windows: jvm.dll is required on the system path (usually located in "C:\Program Files\Java\jre6\bin\client").
Note: For Debian-based Linux distributions, install PDFBox and its dependencies: sudo apt-get install libpdfbox-java libfontbox-java libjempbox-java libcommons-logging-java. For Windows, use the PDFBOX_HOME environment variable.

Function Summary

extract-text ($pdf as xs:base64Binary, $options as element(rp-options:extract-text-options)?) as xs:string

Extracts the text of the input pdf.

render-to-images ($pdf as xs:base64Binary, $options as element(rp-options:render-to-images-options)?) as xs:base64Binary*

Renders each page of the PDF document as an image.

Functions

extract-text#2

declare  function read-pdf:extract-text($pdf as xs:base64Binary, $options as element(rp-options:extract-text-options)?) as xs:string
Extracts the text of the input pdf.
Please consult the official PDFBox documentation for further information.
Example:
  import module namespace read-pdf = "http://www.zorba-xquery.com/modules/read-pdf";
  import module namespace file = "http://expath.org/ns/file";
  declare namespace rpo =
      "http://www.zorba-xquery.com/modules/read-pdf/read-pdf-options";
  let $pdf := file:read-binary("path/to/my.pdf")
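  let $options  :=
     <rpo:extract-text-options>
       <rpo:text-kind>simple</rpo:text-kind>
       <rpo:start-page>2</rpo:start-page>
       <rpo:end-page>3</rpo:end-page>
       <rpo:password>decryption_password</rpo:password>
       <rpo:ignore-corrupt-objects>false</rpo:ignore-corrupt-objects>
       <rpo:ignore-beads>false</rpo:ignore-beads>
       <rpo:start-page-separator>---start-page-separator---</rpo:start-page-separator>
       <rpo:end-page-separator>---end-page-separator---</rpo:end-page-separator>
     </rpo:extract-text-options>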
  return
      read-pdf:extract-text($pdf, $options)
 

Parameters

pdf as xs:base64Binary
The input PDF instance as xs:base64Binary
options as element(rp-options:extract-text-options)
Options:
  • text-kind: string (default html) html: format output as HTML, simple: plain text.
  • start-page: int (default 1) Which page to start with.
  • end-page: int (default last document page) Which page to end with.
  • password: string The decryption password. Optional; only needed if the PDF is password protected.
  • ignore-corrupt-objects: boolean (default false) If true, try to recover in case of corrupt objects; otherwise exit with an error.
  • ignore-beads: boolean (default false) If true disables the separation by beads.
  • start-page-separator: string The optional separator at every start page.
  • end-page-separator: string The optional separator at every end page.

Returns

xs:string
The text contained in the PDF document.

render-to-images#2

declare  function read-pdf:render-to-images($pdf as xs:base64Binary, $options as element(rp-options:render-to-images-options)?) as xs:base64Binary*
Renders each page of the PDF document as an image.
Please consult the official PDFBox documentation for further information.
Example:
  import module namespace read-pdf = "http://www.zorba-xquery.com/modules/read-pdf";
  import module namespace file = "http://expath.org/ns/file";
  declare namespace rpo =
      "http://www.zorba-xquery.com/modules/read-pdf/read-pdf-options";
  let $pdf := file:read-binary("path/to/my.pdf")
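  let $options  :=
     <rpo:render-to-images-options>
       <rpo:image-kind>jpg</rpo:image-kind>
       <rpo:start-page>2</rpo:start-page>
       <rpo:end-page>3</rpo:end-page>
       <rpo:password>decryption_password</rpo:password>
     </rpo:render-to-images-options>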
  let $imgs := read-pdf:render-to-images($pdf, $options)
  for $img at $pos in $imgs
  return
  {
    file:write-binary("img-page" || $pos || ".jpg", $img);
    $pos
  }
 

Parameters

pdf as xs:base64Binary
The input PDF instance as xs:base64Binary
options as element(rp-options:render-to-images-options)
Options:
  • image-kind: string (default jpg) Image type encoding. Supported encodings: jpg, png.
  • start-page: int (default 1) Which page to start with.
  • end-page: int (default last document page) Which page to end with.
  • password: string The decryption password. Optional; only needed if the PDF is password protected.

Returns

xs:base64Binary*
The rendered pages in the PDF document, as images.