PDF in PHP ausgeben

Diese Seite verwendet Cookies. Durch die Nutzung unserer Seite erklären Sie sich damit einverstanden, dass wir Cookies setzen. Weitere Informationen

  • Quellcode

    // Author: http://www.webcheatsheet.com/php/reading_clean_text_from_pdf.php
    //
    // Extract the text of a PDF file.
    //
    // Relies on the helper functions getObjectOptions(), getDecodedStream(),
    // getDirtyTexts(), getCharTransformations() and getTextUsingTransformations(),
    // which must be defined alongside this function.
    //
    // $filename  Path of the PDF file to read.
    // Returns the extracted text, or "" when the file cannot be read, is empty
    // or contains no "obj ... endobj" sections.
    function pdf2text($filename) {
        // Read the raw bytes. The second parameter of file_get_contents() is the
        // boolean $use_include_path, so no flag constant is passed here.
        $infile = @file_get_contents($filename);
        if ($infile === false || $infile === "")
            return "";

        $transformations = array();
        $texts = array();

        // Get the list of all objects. preg_match_all() returns 0 when nothing
        // matches and false on a PCRE error; in both cases there is no text.
        if (!preg_match_all("#obj(.*)endobj#ismU", $infile, $objects))
            return "";

        foreach ($objects[1] as $currentObject) {
            // Only objects that include a data stream are of interest.
            if (!preg_match("#stream(.*)endstream#ismU", $currentObject, $stream))
                continue;
            $stream = ltrim($stream[1]);

            // Objects declaring Length1, Type or Subtype are skipped; the
            // remaining streams are treated as candidates for text data.
            $options = getObjectOptions($currentObject);
            if (!empty($options["Length1"]) || !empty($options["Type"]) || !empty($options["Subtype"]))
                continue;

            // Decode the stream; a failed decode yields nothing usable.
            $data = getDecodedStream($stream, $options);
            if (!is_string($data) || $data === "")
                continue;

            // Streams with BT ... ET sections hold text; any other stream is
            // scanned for character mapping tables.
            if (preg_match_all("#BT(.*)ET#ismU", $data, $textContainers))
                getDirtyTexts($texts, $textContainers[1]);
            else
                getCharTransformations($transformations, $data);
        }

        // Analyze text blocks taking into account character transformations.
        return getTextUsingTransformations($texts, $transformations);
    }
    Alles anzeigen


    Für das nächste Mal: Selber suchen macht schlau.
    =O
  • Hallo,
    Ich wollte Ihnen mitteilen, dass es nicht funktioniert!
    Quelltext:

    Quellcode

    1. <?php
    // Extract the text of a PDF file.
    //
    // Relies on the helper functions getObjectOptions(), getDecodedStream(),
    // getDirtyTexts(), getCharTransformations() and getTextUsingTransformations(),
    // which must be defined alongside this function.
    //
    // $filename  Path of the PDF file to read.
    // Returns the extracted text, or "" when the file cannot be read, is empty
    // or contains no "obj ... endobj" sections.
    function pdf2text($filename) {
        // Read the raw bytes. The second parameter of file_get_contents() is the
        // boolean $use_include_path, so no flag constant is passed here.
        $infile = @file_get_contents($filename);
        if ($infile === false || $infile === "")
            return "";

        $transformations = array();
        $texts = array();

        // Get the list of all objects. preg_match_all() returns 0 when nothing
        // matches and false on a PCRE error; in both cases there is no text.
        if (!preg_match_all("#obj(.*)endobj#ismU", $infile, $objects))
            return "";

        foreach ($objects[1] as $currentObject) {
            // Only objects that include a data stream are of interest.
            if (!preg_match("#stream(.*)endstream#ismU", $currentObject, $stream))
                continue;
            $stream = ltrim($stream[1]);

            // Objects declaring Length1, Type or Subtype are skipped; the
            // remaining streams are treated as candidates for text data.
            $options = getObjectOptions($currentObject);
            if (!empty($options["Length1"]) || !empty($options["Type"]) || !empty($options["Subtype"]))
                continue;

            // Decode the stream; a failed decode yields nothing usable.
            $data = getDecodedStream($stream, $options);
            if (!is_string($data) || $data === "")
                continue;

            // Streams with BT ... ET sections hold text; any other stream is
            // scanned for character mapping tables.
            if (preg_match_all("#BT(.*)ET#ismU", $data, $textContainers))
                getDirtyTexts($texts, $textContainers[1]);
            else
                getCharTransformations($transformations, $data);
        }

        // Analyze text blocks taking into account character transformations.
        return getTextUsingTransformations($texts, $transformations);
    }
    // Demo call: print whatever pdf2text() returns for the file "test.pdf".
    37. echo pdf2text("test.pdf");
    38. ?>
    Alles anzeigen
  • booten99 schrieb:

    Hallo,
    Ich wollte Ihnen mitteilen, dass es nicht funktioniert!

    :thumbsup:


    Hat sich das Problem gelöst? Wenn nicht, dann stell bitte eine Frage, und wenn etwas nicht funktioniert, dann beschreib bitte, was genau nicht funktioniert.


    Es könnte sein, dass die Funktion nicht funktioniert, da noch Abhängigkeiten bestehen. Auf der verlinkten Seite von bastey gibt es aber die vollständige Source.

    Quellcode

    1. <?php
    // Decode data encoded with the PDF ASCIIHexDecode filter.
    //
    // Pairs of hexadecimal digits are turned into bytes. NUL, tab, CR, form
    // feed, LF and space are skipped, "%" starts a comment that runs to the end
    // of the line, and ">" terminates the data.
    //
    // $input  The encoded data, including the terminating ">".
    // Returns the decoded bytes, or "" when a non-hex character is found or the
    // terminating ">" is missing.
    function decodeAsciiHex($input) {
        $output = "";
        $highNibble = -1;      // -1 while no high nibble is pending
        $isComment = false;
        $length = strlen($input);

        for ($i = 0; $i < $length && $input[$i] !== '>'; $i++) {
            $c = $input[$i];

            if ($isComment) {
                // Double quotes are required here: '\r' in single quotes is the
                // two characters backslash + r and would never match.
                if ($c === "\r" || $c === "\n")
                    $isComment = false;
                continue;
            }
            if (strpos("\0\t\r\f\n ", $c) !== false)
                continue;
            if ($c === '%') {
                $isComment = true;
                continue;
            }

            // The position in the digit table is the value of the hex digit.
            $code = strpos("0123456789ABCDEF", strtoupper($c));
            if ($code === false)
                return "";

            if ($highNibble < 0) {
                $highNibble = $code;
            } else {
                $output .= chr($highNibble * 16 + $code);
                $highNibble = -1;
            }
        }

        // The loop ran off the end without meeting the ">" terminator.
        if ($i >= $length)
            return "";

        // An odd number of digits: the last digit is followed by an implied 0.
        if ($highNibble >= 0)
            $output .= chr($highNibble * 16);

        return $output;
    }
    // Decode data encoded with the PDF ASCII85Decode filter.
    //
    // Every group of five characters in the range "!".."u" encodes four bytes;
    // "z" at a group boundary stands for four zero bytes. NUL, tab, CR, form
    // feed, LF and space are skipped and "~" ends the data. A final group of
    // 2-4 characters yields 1-3 bytes.
    //
    // $input  The encoded data.
    // Returns the decoded bytes, or "" when an invalid character is found or
    // the final group consists of a single character.
    function decodeAscii85($input) {
        $output = "";
        $ords = array();
        $state = 0;            // number of digits collected for the current group
        $length = strlen($input);

        for ($i = 0; $i < $length && $input[$i] !== '~'; $i++) {
            $c = $input[$i];

            // Whitespace between digits carries no data. "%" is NOT a comment
            // marker here: it lies inside "!".."u" and is a regular digit.
            if (strpos("\0\t\r\f\n ", $c) !== false)
                continue;

            if ($c === 'z' && $state === 0) {
                $output .= str_repeat(chr(0), 4);
                continue;
            }

            $code = ord($c);
            if ($code < ord('!') || $code > ord('u'))
                return "";

            $ords[$state++] = $code - ord('!');
            if ($state == 5) {
                $state = 0;
                for ($sum = 0, $j = 0; $j < 5; $j++)
                    $sum = $sum * 85 + $ords[$j];
                for ($j = 3; $j >= 0; $j--)
                    $output .= chr(($sum >> ($j * 8)) & 0xFF);
            }
        }

        // A single leftover digit cannot encode a byte.
        if ($state === 1)
            return "";

        if ($state > 1) {
            // Partial final group: the last digit is incremented by one so that
            // the truncated value rounds to the intended leading bytes.
            for ($i = 0, $sum = 0; $i < $state; $i++)
                $sum += ($ords[$i] + ($i == $state - 1 ? 1 : 0)) * pow(85, 4 - $i);
            for ($i = 0; $i < $state - 1; $i++)
                $output .= chr(($sum >> ((3 - $i) * 8)) & 0xFF);
        }

        return $output;
    }
    // Decode data encoded with the PDF FlateDecode filter.
    //
    // $input  The compressed data, as accepted by gzuncompress().
    // Returns the uncompressed bytes, or "" when the data cannot be
    // uncompressed, so callers always receive a string.
    function decodeFlate($input) {
        // gzuncompress() emits a warning and returns false on corrupt input;
        // the warning is suppressed because decoding is best-effort.
        $decoded = @gzuncompress($input);
        return $decoded === false ? "" : $decoded;
    }
    // Parse the first "<< ... >>" dictionary of a PDF object into an array.
    //
    // The dictionary text is split at every "/". An entry containing whitespace
    // is stored as name => first following token; an entry without whitespace
    // is stored as name => true.
    //
    // $object  Raw text of one PDF object.
    // Returns the parsed options, or an empty array when no dictionary is found.
    function getObjectOptions($object) {
        $dictionary = array();
        if (!preg_match("#<<(.*)>>#ismU", $object, $dictionary))
            return array();

        // Whatever precedes the first "/" is not an entry and is dropped.
        $entries = explode("/", $dictionary[1]);
        array_shift($entries);

        $parsed = array();
        foreach ($entries as $entry) {
            // Collapse whitespace runs so that a single space separates tokens.
            $entry = preg_replace("#\s+#", " ", trim($entry));

            if (strpos($entry, " ") === false) {
                $parsed[$entry] = true;
                continue;
            }

            $tokens = explode(" ", $entry);
            $parsed[$tokens[0]] = $tokens[1];
        }

        return $parsed;
    }
    // Decode a stream according to the options of its object.
    //
    // Without a "Filter" option the stream is returned unchanged. Otherwise the
    // stream is cut to the declared "Length" and every option key naming a
    // supported filter (ASCIIHexDecode, ASCII85Decode, FlateDecode) is applied
    // in the order in which the keys appear in $options.
    //
    // $stream   Raw stream data.
    // $options  Options as returned by getObjectOptions().
    // Returns the decoded data, or "" when a decoder fails.
    function getDecodedStream($stream, $options) {
        if (empty($options["Filter"]))
            return $stream;

        // Only a positive numeric Length is trusted; anything else (for example
        // the value true stored for an entry without a value) falls back to the
        // full stream instead of truncating it.
        $length = strlen($stream);
        if (isset($options["Length"]) && is_numeric($options["Length"]) && (int) $options["Length"] > 0)
            $length = (int) $options["Length"];

        $decoded = substr($stream, 0, $length);

        foreach ($options as $key => $value) {
            // Array keys made only of digits are integers; compare as strings
            // with === so that such a key can never equal a filter name.
            $key = (string) $key;
            if ($key === "ASCIIHexDecode")
                $decoded = decodeAsciiHex($decoded);
            elseif ($key === "ASCII85Decode")
                $decoded = decodeAscii85($decoded);
            elseif ($key === "FlateDecode")
                $decoded = decodeFlate($decoded);

            // A decoder that reports failure with a non-string ends decoding.
            if (!is_string($decoded))
                return "";
        }

        return $decoded;
    }
    // Collect the raw text operands found in text containers.
    //
    // For each container the contents of all "[ ... ] TJ" arrays are appended
    // to $texts; if the container has none, the "( ... )" operands of
    // "Td ( ... ) Tj" sequences are appended instead.
    //
    // $texts           List that receives the raw text fragments (by reference).
    // $textContainers  List of text container contents.
    function getDirtyTexts(&$texts, $textContainers) {
        foreach ($textContainers as $container) {
            $found = array();

            // The second pattern is only tried when the first one finds nothing.
            $hasText = preg_match_all("#\[(.*)\]\s*TJ#ismU", $container, $found)
                || preg_match_all("#Td\s*(\(.*\))\s*Tj#ismU", $container, $found);

            if ($hasText)
                $texts = array_merge($texts, $found[1]);
        }
    }
    // Read character mapping tables from a stream.
    //
    // "N beginbfchar ... endbfchar" sections map single source codes, and
    // "N beginbfrange ... endbfrange" sections map code ranges, either to
    // consecutive values or to an explicit "[ ... ]" list. One mapping is read
    // per line, at most N lines per section.
    //
    // $transformations  Map source code => destination hex string (by reference).
    // $stream           Decoded stream data to scan.
    function getCharTransformations(&$transformations, $stream) {
        $chars = array();
        $ranges = array();
        preg_match_all("#([0-9]+)\s+beginbfchar(.*)endbfchar#ismU", $stream, $chars, PREG_SET_ORDER);
        preg_match_all("#([0-9]+)\s+beginbfrange(.*)endbfrange#ismU", $stream, $ranges, PREG_SET_ORDER);

        foreach ((array) $chars as $section) {
            $count = (int) $section[1];
            // Lines may end in LF, CR LF or a lone CR.
            $current = preg_split("#\r\n|\r|\n#", trim($section[2]));
            for ($k = 0; $k < $count && $k < count($current); $k++) {
                if (preg_match("#<([0-9a-f]{2,4})>\s+<([0-9a-f]{4,512})>#is", trim($current[$k]), $map)) {
                    // Short source codes are padded on the right, which is the
                    // same key format getTextUsingTransformations() looks up.
                    $transformations[str_pad($map[1], 4, "0")] = $map[2];
                }
            }
        }

        foreach ((array) $ranges as $section) {
            $count = (int) $section[1];
            $current = preg_split("#\r\n|\r|\n#", trim($section[2]));
            for ($k = 0; $k < $count && $k < count($current); $k++) {
                $line = trim($current[$k]);
                if (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+<([0-9a-f]{4})>#is", $line, $map)) {
                    // <from> <to> <start>: consecutive destination values.
                    $from = hexdec($map[1]);
                    $to = hexdec($map[2]);
                    $_from = hexdec($map[3]);
                    for ($m = $from, $n = 0; $m <= $to; $m++, $n++)
                        $transformations[sprintf("%04X", $m)] = sprintf("%04X", $_from + $n);
                } elseif (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+\[(.*)\]#ismU", $line, $map)) {
                    // <from> <to> [<d1> <d2> ...]: one listed destination per code.
                    $from = hexdec($map[1]);
                    $to = hexdec($map[2]);
                    // Take only the hex digits between "<" and ">", so the
                    // delimiters never reach a number conversion and adjacent
                    // entries need no whitespace between them.
                    preg_match_all("#<([0-9a-f]+)>#i", $map[3], $destinations);
                    $parts = $destinations[1];
                    for ($m = $from, $n = 0; $m <= $to && $n < count($parts); $m++, $n++) {
                        // Keep the digit string as is (upper-cased, at least four
                        // digits) so longer destinations are not collapsed.
                        $transformations[sprintf("%04X", $m)] = str_pad(strtoupper($parts[$n]), 4, "0", STR_PAD_LEFT);
                    }
                }
            }
        }
    }
    // Encode a Unicode code point as UTF-8.
    // Surrogates and values above U+10FFFF are replaced by U+FFFD.
    function pdf2textCodePointToUtf8($cp) {
        if ($cp > 0x10FFFF || ($cp >= 0xD800 && $cp <= 0xDFFF))
            $cp = 0xFFFD;
        if ($cp < 0x80)
            return chr($cp);
        if ($cp < 0x800)
            return chr(0xC0 | ($cp >> 6)) . chr(0x80 | ($cp & 0x3F));
        if ($cp < 0x10000)
            return chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
        return chr(0xF0 | ($cp >> 18)) . chr(0x80 | (($cp >> 12) & 0x3F))
            . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
    }

    // Convert a string of hex digits, read as 16-bit units of four digits each,
    // to UTF-8. A high surrogate followed by a low surrogate forms one character.
    function pdf2textUtf16HexToUtf8($hex) {
        $result = "";
        $units = str_split($hex, 4);
        $count = count($units);
        for ($u = 0; $u < $count; $u++) {
            $cp = hexdec($units[$u]);
            if ($cp >= 0xD800 && $cp <= 0xDBFF && $u + 1 < $count) {
                $low = hexdec($units[$u + 1]);
                if ($low >= 0xDC00 && $low <= 0xDFFF) {
                    $cp = 0x10000 + (($cp - 0xD800) << 10) + ($low - 0xDC00);
                    $u++;
                }
            }
            $result .= pdf2textCodePointToUtf8($cp);
        }
        return $result;
    }

    // Turn raw text operands into readable text.
    //
    // Each entry of $texts is scanned for "( ... )" literal strings and
    // "< ... >" hex strings; everything between them is ignored. Hex strings
    // are split into four-digit codes, each code is replaced through
    // $transformations when a mapping exists, and the result is emitted as
    // UTF-8. Literal strings are emitted with their backslash escapes resolved.
    // A line feed is appended after every entry.
    //
    // $texts            List of raw text operands (see getDirtyTexts()).
    // $transformations  Map source code => destination hex string
    //                   (see getCharTransformations()).
    // Returns the assembled text.
    function getTextUsingTransformations($texts, $transformations) {
        $document = "";

        foreach ($texts as $text) {
            $isHex = false;
            $depth = 0;        // parenthesis nesting level; 0 = outside a literal string
            $hex = "";
            $plain = "";
            $length = strlen($text);

            for ($j = 0; $j < $length; $j++) {
                $c = $text[$j];

                if ($depth > 0) {
                    // Inside a literal string "<" and ">" are ordinary text.
                    if ($c === "\\") {
                        $j++;
                        if ($j >= $length)
                            break;     // trailing backslash: nothing to escape
                        $c2 = $text[$j];
                        if ($c2 === "\\" || $c2 === "(" || $c2 === ")")
                            $plain .= $c2;
                        // Double quotes are required so that the real control
                        // characters are emitted, not backslash + letter.
                        elseif ($c2 === "n") $plain .= "\n";
                        elseif ($c2 === "r") $plain .= "\r";
                        elseif ($c2 === "t") $plain .= "\t";
                        elseif ($c2 === "b") $plain .= "\x08";
                        elseif ($c2 === "f") $plain .= "\f";
                        elseif (preg_match("#^[0-7]{1,3}#", substr($text, $j, 3), $oct)) {
                            // One to three octal digits form a character code;
                            // bits above the low eight are discarded.
                            $j += strlen($oct[0]) - 1;
                            $plain .= pdf2textCodePointToUtf8(octdec($oct[0]) & 0xFF);
                        } elseif ($c2 === "\r" || $c2 === "\n") {
                            // Backslash before a line end: the line break is dropped.
                            if ($c2 === "\r" && $j + 1 < $length && $text[$j + 1] === "\n")
                                $j++;
                        } else {
                            // Unknown escape: only the backslash is dropped.
                            $plain .= $c2;
                        }
                    } elseif ($c === "(") {
                        // Nested parentheses are part of the text.
                        $depth++;
                        $plain .= $c;
                    } elseif ($c === ")") {
                        $depth--;
                        if ($depth === 0)
                            $document .= $plain;
                        else
                            $plain .= $c;
                    } else {
                        $plain .= $c;
                    }
                    continue;
                }

                if ($isHex) {
                    if ($c !== ">") {
                        $hex .= $c;
                        continue;
                    }
                    // End of the hex string: convert it code by code.
                    $digits = preg_replace("#[^0-9A-Fa-f]#", "", $hex);
                    foreach (str_split($digits, 4) as $unit) {
                        if ($unit === "")
                            continue;
                        $chex = str_pad($unit, 4, "0");
                        // Mapping keys may be stored in either letter case, so
                        // the code is looked up as written, then upper- and
                        // lower-cased.
                        foreach (array($chex, strtoupper($chex), strtolower($chex)) as $key) {
                            if (isset($transformations[$key])) {
                                $chex = $transformations[$key];
                                break;
                            }
                        }
                        $document .= pdf2textUtf16HexToUtf8($chex);
                    }
                    $isHex = false;
                    continue;
                }

                if ($c === "(") {
                    $plain = "";
                    $depth = 1;
                } elseif ($c === "<") {
                    $hex = "";
                    $isHex = true;
                }
                // Anything else outside of strings is not text.
            }

            // An unterminated literal string still contributes what was read.
            if ($depth > 0)
                $document .= $plain;

            $document .= "\n";
        }

        return $document;
    }
    // Extract the text of a PDF file.
    //
    // $filename  Path of the PDF file to read.
    // Returns the extracted text, or "" when the file cannot be read, is empty
    // or contains no "obj ... endobj" sections.
    function pdf2text($filename) {
        // Read the raw bytes. The second parameter of file_get_contents() is the
        // boolean $use_include_path, so no flag constant is passed here.
        $infile = @file_get_contents($filename);
        if ($infile === false || $infile === "")
            return "";

        $transformations = array();
        $texts = array();

        // Get the list of all objects. preg_match_all() returns 0 when nothing
        // matches and false on a PCRE error; in both cases there is no text.
        if (!preg_match_all("#obj(.*)endobj#ismU", $infile, $objects))
            return "";

        foreach ($objects[1] as $currentObject) {
            // Only objects that include a data stream are of interest.
            if (!preg_match("#stream(.*)endstream#ismU", $currentObject, $stream))
                continue;
            $stream = ltrim($stream[1]);

            // Objects declaring Length1, Type or Subtype are skipped; the
            // remaining streams are treated as candidates for text data.
            $options = getObjectOptions($currentObject);
            if (!empty($options["Length1"]) || !empty($options["Type"]) || !empty($options["Subtype"]))
                continue;

            // Decode the stream; a failed decode yields nothing usable.
            $data = getDecodedStream($stream, $options);
            if (!is_string($data) || $data === "")
                continue;

            // Streams with BT ... ET sections hold text; any other stream is
            // scanned for character mapping tables.
            if (preg_match_all("#BT(.*)ET#ismU", $data, $textContainers))
                getDirtyTexts($texts, $textContainers[1]);
            else
                getCharTransformations($transformations, $data);
        }

        // Analyze text blocks taking into account character transformations.
        return getTextUsingTransformations($texts, $transformations);
    }
    245. ?>
    Alles anzeigen