Squiz Matrix  4.12.2
 All Data Structures Namespaces Functions Variables Pages
pdf_file.inc
1 <?php
18 require_once SQ_CORE_PACKAGE_PATH.'/files/file/file.inc';
19 require_once SQ_DATA_PATH.'/private/conf/tools.inc';
20 
21 
/**
* PDF_File
*
* File asset restricted to the 'pdf' extension. getContent() extracts the
* text of the stored document by running the configured pdftohtml tool.
*/
class PDF_File extends File
{


    /**
    * File extensions accepted by this asset type
    *
    * @var array
    */
    public $allowed_extensions = Array('pdf');


    /**
    * Constructor
    *
    * @param int    $assetid    id handed straight to the parent constructor
    */
    function __construct($assetid=0)
    {
        parent::__construct($assetid);

    }//end constructor


    /**
    * Returns the text of the PDF as a single lower-cased string
    *
    * The stored file is converted with pdftohtml (only when
    * SQ_TOOL_PDFTOHTML_ENABLED is set), the HTML tags are stripped, a leading
    * file path left over from the generated title is removed, HTML entities
    * are dropped and runs of whitespace/commas are collapsed to one space.
    *
    * @return string|boolean    the extracted text; '' when there is no file,
    *                           no extractable text or the tool exits with a
    *                           non-zero status; FALSE when the tool wrote
    *                           anything to stderr
    */
    function getContent()
    {
        $file_info = $this->getExistingFile();
        if (empty($file_info)) return '';
        $file = $file_info['path'];

        $content = '';

        if (SQ_TOOL_PDFTOHTML_ENABLED) {
            // Quote every argument on its own. Running escapeshellcmd() over the
            // whole command line (as was done before) leaves whitespace alone, so
            // a path or password containing spaces was split into several
            // arguments, and other special characters were backslash-mangled.
            // NOTE(review): escapeshellarg() depends on LC_CTYPE and may drop
            // non-ASCII bytes under the C locale - confirm the server locale.
            $pwd     = $this->attr('password');
            $pwd_arg = (empty($pwd)) ? '' : ' -opw '.escapeshellarg($pwd);

            $cmd = escapeshellcmd(SQ_TOOL_PDFTOHTML_PATH).' -i -nomerge -noframes -stdout'.$pwd_arg.' '.escapeshellarg($file);

            $write_pipe = Array(
                            'pipe',
                            'w',
                          );
            $spec = Array(
                        0 => Array(
                                'pipe',
                                'r',
                             ),        // stdin is a pipe that the child will read from
                        1 => $write_pipe,  // stdout is a pipe that the child will write to
                        2 => $write_pipe,  // stderr is a pipe that the child will write to
                    );

            $cwd     = NULL;
            $env     = Array();  // the child is started with an empty environment
            $process = proc_open($cmd, $spec, $pipes, $cwd, $env);

            if (is_resource($process)) {
                // Nothing is sent to the tool on stdin
                fclose($pipes[0]);

                // NOTE(review): stdout is drained completely before stderr is read
                $content = stream_get_contents($pipes[1]);
                fclose($pipes[1]);

                $errors = stream_get_contents($pipes[2]);
                fclose($pipes[2]);

                // Always reap the process before any early return so that neither
                // the stderr pipe nor the process handle is leaked
                $status = proc_close($process);

                // Anything on stderr is reported as a warning, never sent to output
                if (!empty($errors)) {
                    $errors = array_unique(explode(PHP_EOL, $errors));
                    trigger_error('Error occured while getting contents for pdf File Asset #'.$this->id.' : '.trim(implode(', ', $errors), ', '), E_USER_WARNING);
                    return FALSE;
                }//end if

                // A zero exit status means success; anything else is logged with a
                // message chosen by the status code
                if ($status !== 0) {
                    $status_messages = Array(
                                        1 => 'Invalid Argument.',
                                        2 => 'Invalid Encoded Output.',
                                        3 => 'Copying text from this Document is not allowed.',
                                        4 => 'Error Opening File (e.g. corrupted file).',
                                       );
                    $error_str = isset($status_messages[$status]) ? $status_messages[$status] : 'Unknown Error.';

                    trigger_error('Error occured while getting contents for pdf File Asset #'.$this->id.' : '.$error_str, E_USER_WARNING);
                    return '';
                }//end if
            }//end if
        }//end if

        if (empty($content)) return '';

        // strip out the tags
        $content = trim(strip_tags($content));
        if (empty($content)) return '';

        // NOTE(review): kept from the original code. This only changes $file when
        // the path contains a double quote, and $file is used below solely to
        // work out how many leading characters to drop - confirm it is still wanted.
        $file = str_replace('"','\\\"',$file);

        // in some cases file path ends up as the first string in the file
        // this is the way pdftohtml treats files without the title, it puts filename into html title
        if (($this->status & SQ_SC_STATUS_SAFE_EDITING) && ($GLOBALS['SQ_SYSTEM']->user instanceof Public_User)) {
            // strip file versioning suffix from the filename, e.g. 'file.pdf,ffv5' becomes 'file.pdf'
            $file = preg_replace('/,ffv\d+$/', '', $file);
        }

        // if file extension is .pdf, the .pdf part is omitted by pdftohtml from the first string (strange)
        // here we prepare the file name for that case
        if (substr($file, -4) == '.pdf') {
            $file = substr($file, 0, strlen($file)-4);
        }

        // if we suspect that the first string is the file path, remove it
        // NOTE: this will only work on *nix type filesystems, because the test
        // is simply whether the text starts with a '/'
        if (isset($content[0]) && $content[0] == '/') {
            $content = substr($content, strlen($file));
        }

        // strip out the unnecessary whitespace and html entities like &amp;
        $search  = Array('/[\s,]+/', '/\&[^;\s]+;/');
        $replace = Array(' ', '');

        $content = strtolower(trim(preg_replace($search, $replace, $content)));

        return $content;

    }//end getContent()


}//end class
186 ?>