PDF in PHP ausgeben

This site uses cookies. By continuing to browse this site, you are agreeing to our Cookie Policy.

  • Source Code

    1. // Author: http://www.webcheatsheet.com/php/reading_clean_text_from_pdf.php
    /**
     * Extract the readable text from a PDF file.
     *
     * Every "obj ... endobj" section that carries a stream is decoded. Decoded
     * streams containing BT ... ET text blocks contribute raw text fragments;
     * all other decoded streams are scanned for character mappings
     * (bfchar / bfrange). Objects whose dictionary has a Length1, Type or
     * Subtype entry are skipped.
     *
     * @param string $filename Path of the PDF file to read.
     * @return string The extracted text, or "" when the file cannot be read or is empty.
     */
    function pdf2text($filename) {
        // Read the data from the pdf file. FILE_BINARY is intentionally not
        // passed: the second parameter of file_get_contents() is
        // $use_include_path, and the constant is deprecated as of PHP 8.1.
        $infile = @file_get_contents($filename);
        if (empty($infile))
            return "";

        // Collected text fragments and character code mappings.
        $transformations = array();
        $texts = array();

        // Get the list of all objects. If the regex engine fails, fall back to
        // an empty list instead of iterating over null.
        preg_match_all("#obj(.*)endobj#ismU", $infile, $objects);
        $objects = (isset($objects[1]) && is_array($objects[1])) ? $objects[1] : array();

        foreach ($objects as $currentObject) {
            // Only objects that include a data stream are of interest.
            if (!preg_match("#stream(.*)endstream#ismU", $currentObject, $stream))
                continue;
            $stream = ltrim($stream[1]);

            // Check object parameters and look for text data.
            $options = getObjectOptions($currentObject);
            if (!(empty($options["Length1"]) && empty($options["Type"]) && empty($options["Subtype"])))
                continue;

            // So, we have text data. Decode it; a failed decode yields nothing.
            $data = getDecodedStream($stream, $options);
            if (!is_string($data) || $data === "")
                continue;

            if (preg_match_all("#BT(.*)ET#ismU", $data, $textContainers))
                getDirtyTexts($texts, $textContainers[1]);
            else
                getCharTransformations($transformations, $data);
        }

        // Analyze text blocks taking into account character transformations and return results.
        return getTextUsingTransformations($texts, $transformations);
    }
    Display All


    Für das nächste Mal: Selber suchen macht schlau.
    =O
  • Hallo,
    Ich wollte Ihnen mitteilen, dass es nicht funktioniert!
    Quelltext:

    Source Code

    1. <?php
    /**
     * Extract the readable text from a PDF file.
     *
     * Every "obj ... endobj" section that carries a stream is decoded. Decoded
     * streams containing BT ... ET text blocks contribute raw text fragments;
     * all other decoded streams are scanned for character mappings
     * (bfchar / bfrange). Objects whose dictionary has a Length1, Type or
     * Subtype entry are skipped.
     *
     * Relies on the helper functions getObjectOptions(), getDecodedStream(),
     * getDirtyTexts(), getCharTransformations() and
     * getTextUsingTransformations(), which must be defined as well.
     *
     * @param string $filename Path of the PDF file to read.
     * @return string The extracted text, or "" when the file cannot be read or is empty.
     */
    function pdf2text($filename) {
        // Read the data from the pdf file. FILE_BINARY is intentionally not
        // passed: the second parameter of file_get_contents() is
        // $use_include_path, and the constant is deprecated as of PHP 8.1.
        $infile = @file_get_contents($filename);
        if (empty($infile))
            return "";

        // Collected text fragments and character code mappings.
        $transformations = array();
        $texts = array();

        // Get the list of all objects. If the regex engine fails, fall back to
        // an empty list instead of iterating over null.
        preg_match_all("#obj(.*)endobj#ismU", $infile, $objects);
        $objects = (isset($objects[1]) && is_array($objects[1])) ? $objects[1] : array();

        foreach ($objects as $currentObject) {
            // Only objects that include a data stream are of interest.
            if (!preg_match("#stream(.*)endstream#ismU", $currentObject, $stream))
                continue;
            $stream = ltrim($stream[1]);

            // Check object parameters and look for text data.
            $options = getObjectOptions($currentObject);
            if (!(empty($options["Length1"]) && empty($options["Type"]) && empty($options["Subtype"])))
                continue;

            // So, we have text data. Decode it; a failed decode yields nothing.
            $data = getDecodedStream($stream, $options);
            if (!is_string($data) || $data === "")
                continue;

            if (preg_match_all("#BT(.*)ET#ismU", $data, $textContainers))
                getDirtyTexts($texts, $textContainers[1]);
            else
                getCharTransformations($transformations, $data);
        }

        // Analyze text blocks taking into account character transformations and return results.
        return getTextUsingTransformations($texts, $transformations);
    }

    // Print the text extracted from test.pdf.
    echo pdf2text("test.pdf");
    38. ?>
    Display All
  • booten99 wrote:

    Hallo,
    Ich wollte Ihnen mitteilen, dass es nicht funktioniert!

    :thumbsup:


    Hat sich das Problem gelöst? Wenn nicht, dann stell bitte eine Frage, und wenn etwas nicht funktioniert, dann beschreib bitte, was genau nicht funktioniert.


    Es könnte sein, dass die Funktion nicht funktioniert, da noch Abhängigkeiten bestehen. Auf der verlinkten Seite von bastey gibt es aber die vollständige Source.

    Source Code

    1. <?php
    /**
     * Decode an ASCIIHexDecode stream.
     *
     * Hex digit pairs are turned into bytes until the terminating ">" is found.
     * Whitespace (NUL, tab, CR, FF, LF, space) is ignored and "%" starts a
     * comment that runs to the end of the line. If the data ends with a single
     * unpaired digit, that digit is treated as if it were followed by "0".
     *
     * @param string $input Encoded data, terminated by ">".
     * @return string Decoded bytes, or "" when a non-hex character is met or
     *                the ">" terminator is missing.
     */
    function decodeAsciiHex($input) {
        $output = "";
        $length = strlen($input);
        $highNibble = -1;   // first digit of the current pair, -1 when none is pending
        $isComment = false;

        for ($i = 0; $i < $length && $input[$i] != '>'; $i++) {
            $c = $input[$i];

            if ($isComment) {
                // Control characters need double quotes; '\r' would be a
                // two-character literal that never equals a single byte.
                if ($c === "\r" || $c === "\n")
                    $isComment = false;
                continue;
            }

            // Skip whitespace between digits.
            if (strpos("\0\t\r\f\n ", $c) !== false)
                continue;

            if ($c === "%") {
                $isComment = true;
                continue;
            }

            // Anything else must be a hex digit.
            if (stripos("0123456789abcdef", $c) === false)
                return "";
            $code = hexdec($c);

            if ($highNibble < 0) {
                $highNibble = $code;
            } else {
                $output .= chr($highNibble * 16 + $code);
                $highNibble = -1;
            }
        }

        // The loop ran off the end without seeing the ">" terminator.
        if ($i >= $length)
            return "";

        // Odd number of digits: the missing low nibble counts as 0.
        if ($highNibble >= 0)
            $output .= chr($highNibble * 16);

        return $output;
    }
    /**
     * Decode an ASCII85Decode stream.
     *
     * Groups of five characters in the range "!".."u" are converted to four
     * bytes; "z" at a group boundary stands for four zero bytes. Decoding stops
     * at the first "~" (start of the "~>" end marker). A final partial group of
     * n characters (n >= 2) yields n - 1 bytes. Whitespace is ignored.
     *
     * Note: "%" is a regular Ascii85 digit and is decoded as data, not treated
     * as a comment introducer.
     *
     * @param string $input Encoded data.
     * @return string Decoded bytes, or "" when an invalid character is met or
     *                the final group consists of a single character.
     */
    function decodeAscii85($input) {
        $output = "";
        $ords = array();
        $length = strlen($input);
        $base = ord('!');

        for ($i = 0, $state = 0; $i < $length && $input[$i] != '~'; $i++) {
            $c = $input[$i];

            // Skip whitespace (double-quoted so these are real control characters).
            if (strpos("\0\t\r\f\n ", $c) !== false)
                continue;

            // "z" is shorthand for a group of four zero bytes.
            if ($c === 'z' && $state === 0) {
                $output .= str_repeat(chr(0), 4);
                continue;
            }

            $code = ord($c);
            if ($code < $base || $code > ord('u'))
                return "";

            $ords[$state++] = $code - $base;

            if ($state == 5) {
                // Full group: five base-85 digits make one 32-bit value.
                $state = 0;
                for ($sum = 0, $j = 0; $j < 5; $j++)
                    $sum = $sum * 85 + $ords[$j];
                for ($j = 3; $j >= 0; $j--)
                    $output .= chr(($sum >> ($j * 8)) & 0xff);
            }
        }

        // A single leftover character cannot encode any byte.
        if ($state === 1)
            return "";

        if ($state > 1) {
            // Partial group: the last digit is bumped by one so that the
            // truncated high-order bytes round to the encoded values.
            for ($k = 0, $sum = 0; $k < $state; $k++)
                $sum += ($ords[$k] + ($k == $state - 1 ? 1 : 0)) * pow(85, 4 - $k);
            for ($k = 0; $k < $state - 1; $k++)
                $output .= chr(($sum >> ((3 - $k) * 8)) & 0xff);
        }

        return $output;
    }
    /**
     * Inflate zlib-compressed (FlateDecode) data.
     *
     * @param string $input Compressed data.
     * @return string|false The uncompressed data, or false when
     *                      gzuncompress() fails (its warning is suppressed).
     */
    function decodeFlate($input) {
        $inflated = @gzuncompress($input);

        return $inflated;
    }
    /**
     * Read the entries of an object's first "<< ... >>" dictionary.
     *
     * The dictionary text is split on "/". A piece of the form "Name value ..."
     * is stored as Name => value (only the first word after the name is kept);
     * a piece consisting of a single word is stored as word => true.
     *
     * @param string $object Raw text of a PDF object.
     * @return array Map of the parsed entries; empty when no dictionary is found.
     */
    function getObjectOptions($object) {
        if (!preg_match("#<<(.*)>>#ismU", $object, $dictionary)) {
            return array();
        }

        $pieces = explode("/", $dictionary[1]);
        // Drop whatever precedes the first "/".
        array_shift($pieces);

        $parsed = array();
        foreach ($pieces as $piece) {
            // Collapse runs of whitespace so the split below is predictable.
            $piece = preg_replace("#\s+#", " ", trim($piece));

            if (strpos($piece, " ") === false) {
                $parsed[$piece] = true;
                continue;
            }

            $words = explode(" ", $piece);
            $parsed[$words[0]] = $words[1];
        }

        return $parsed;
    }
    /**
     * Decode a stream according to the options of its object.
     *
     * Without a "Filter" option the stream is returned unchanged. Otherwise the
     * stream is cut to the "Length" option (when present and non-empty) and
     * every option key naming a supported filter (ASCIIHexDecode,
     * ASCII85Decode, FlateDecode) is applied in the order the keys appear.
     *
     * @param string $stream  Raw stream data.
     * @param array  $options Options as returned by getObjectOptions().
     * @return mixed The decoded data; whatever the last applied decoder returned.
     */
    function getDecodedStream($stream, $options) {
        // No filter declared: the stream is already plain data.
        if (empty($options["Filter"])) {
            return $stream;
        }

        if (!empty($options["Length"])) {
            $length = $options["Length"];
        } else {
            $length = strlen($stream);
        }

        $decoded = substr($stream, 0, $length);

        foreach (array_keys($options) as $name) {
            if ($name == "ASCIIHexDecode") {
                $decoded = decodeAsciiHex($decoded);
            }
            if ($name == "ASCII85Decode") {
                $decoded = decodeAscii85($decoded);
            }
            if ($name == "FlateDecode") {
                $decoded = decodeFlate($decoded);
            }
        }

        return $decoded;
    }
    /**
     * Collect the raw text operands from a list of BT ... ET text blocks.
     *
     * For each block the contents of all "[ ... ] TJ" arrays are appended to
     * $texts; when a block has none, the "( ... )" operands of
     * "Td ( ... ) Tj" sequences are appended instead.
     *
     * @param array $texts          Receives the extracted fragments (by reference).
     * @param array $textContainers Contents of the BT ... ET blocks.
     * @return void
     */
    function getDirtyTexts(&$texts, $textContainers) {
        foreach ($textContainers as $container) {
            $found = array();

            // The second pattern is only tried when the first finds nothing.
            $matched = preg_match_all("#\[(.*)\]\s*TJ#ismU", $container, $found)
                || preg_match_all("#Td\s*(\(.*\))\s*Tj#ismU", $container, $found);

            if ($matched) {
                $texts = array_merge($texts, @$found[1]);
            }
        }
    }
    /**
     * Collect character code mappings from a decoded stream.
     *
     * Reads "N beginbfchar ... endbfchar" and "N beginbfrange ... endbfrange"
     * sections. At most N lines of each section are examined. bfchar lines map
     * one source code to one target; bfrange lines map a run of source codes
     * either to consecutive targets or to the entries of a "[ ... ]" list.
     *
     * @param array  $transformations Receives source code => target code pairs (by reference).
     * @param string $stream          Decoded stream contents.
     * @return void
     */
    function getCharTransformations(&$transformations, $stream) {
        preg_match_all("#([0-9]+)\s+beginbfchar(.*)endbfchar#ismU", $stream, $charSections, PREG_SET_ORDER);
        preg_match_all("#([0-9]+)\s+beginbfrange(.*)endbfrange#ismU", $stream, $rangeSections, PREG_SET_ORDER);

        // Single character mappings.
        foreach ($charSections as $section) {
            $declared = $section[1];
            $lines = explode("\n", trim($section[2]));

            $row = 0;
            while ($row < $declared && $row < count($lines)) {
                $line = trim($lines[$row]);
                $row++;

                if (preg_match("#<([0-9a-f]{2,4})>\s+<([0-9a-f]{4,512})>#is", $line, $pair)) {
                    // Source codes shorter than four digits are padded on the right.
                    $transformations[str_pad($pair[1], 4, "0")] = $pair[2];
                }
            }
        }

        // Range mappings.
        foreach ($rangeSections as $section) {
            $declared = $section[1];
            $lines = explode("\n", trim($section[2]));

            $row = 0;
            while ($row < $declared && $row < count($lines)) {
                $line = trim($lines[$row]);
                $row++;

                if (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+<([0-9a-f]{4})>#is", $line, $range)) {
                    // <first> <last> <target>: consecutive targets starting at <target>.
                    $first = hexdec($range[1]);
                    $last = hexdec($range[2]);
                    $target = hexdec($range[3]);

                    for ($code = $first; $code <= $last; $code++) {
                        $transformations[sprintf("%04X", $code)] = sprintf("%04X", $target + ($code - $first));
                    }
                } elseif (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+\[(.*)\]#ismU", $line, $range)) {
                    // <first> <last> [ t1 t2 ... ]: one listed target per source code.
                    $first = hexdec($range[1]);
                    $last = hexdec($range[2]);
                    $targets = preg_split("#\s+#", trim($range[3]));

                    foreach ($targets as $offset => $entry) {
                        if ($first + $offset > $last) {
                            break;
                        }
                        $transformations[sprintf("%04X", $first + $offset)] = sprintf("%04X", hexdec($entry));
                    }
                }
            }
        }
    }
    /**
     * Turn raw text operands into readable text.
     *
     * Each entry of $texts is scanned character by character:
     *  - "<...>" hex strings are split into four-digit codes, each code is
     *    replaced through $transformations when a mapping exists and is then
     *    emitted as the corresponding character;
     *  - "(...)" literal strings are copied, with backslash escapes resolved
     *    (\\, \(, \), \n, \r, \t, \b, \f and one to three octal digits);
     *    "<" and ">" inside a literal string are ordinary characters;
     *  - anything outside of strings is ignored.
     * A newline is appended after every entry.
     *
     * @param array $texts           Raw operands as collected by getDirtyTexts().
     * @param array $transformations Code mappings as collected by getCharTransformations().
     * @return string The assembled text.
     */
    function getTextUsingTransformations($texts, $transformations) {
        // Escape letter => control character. Double quotes are required;
        // single-quoted '\n' would be a literal backslash followed by "n".
        $escapes = array("n" => "\n", "r" => "\r", "t" => "\t", "b" => "\x08", "f" => "\f");

        $document = "";
        for ($i = 0; $i < count($texts); $i++) {
            $text = $texts[$i];
            $length = strlen($text);
            $isHex = false;
            $isPlain = false;
            $hex = "";
            $plain = "";

            for ($j = 0; $j < $length; $j++) {
                $c = $text[$j];

                // Inside a literal string the hex delimiters are plain text.
                if ($isPlain && ($c === "<" || $c === ">")) {
                    $plain .= $c;
                    continue;
                }

                switch ($c) {
                    case "<":
                        $hex = "";
                        $isHex = true;
                        break;
                    case ">":
                        $hexs = str_split($hex, 4);
                        for ($k = 0; $k < count($hexs); $k++) {
                            $chex = str_pad($hexs[$k], 4, "0");
                            if (isset($transformations[$chex]))
                                $chex = $transformations[$chex];
                            $document .= html_entity_decode("&#x" . $chex . ";");
                        }
                        $isHex = false;
                        break;
                    case "(":
                        $plain = "";
                        $isPlain = true;
                        break;
                    case ")":
                        $document .= $plain;
                        $isPlain = false;
                        break;
                    case "\\":
                        // Guard against a backslash at the very end of the operand.
                        $c2 = ($j + 1 < $length) ? $text[$j + 1] : "";
                        if ($c2 === "\\" || $c2 === "(" || $c2 === ")") {
                            $plain .= $c2;
                        } elseif (isset($escapes[$c2])) {
                            $plain .= $escapes[$c2];
                        } elseif (preg_match("#^[0-7]{1,3}#", substr($text, $j + 1, 3), $oct)) {
                            // Octal escape: consume only the leading octal digits.
                            $j += strlen($oct[0]) - 1;
                            $plain .= html_entity_decode("&#" . octdec($oct[0]) . ";");
                        }
                        // Skip the character that followed the backslash.
                        $j++;
                        break;
                    default:
                        if ($isHex)
                            $hex .= $c;
                        if ($isPlain)
                            $plain .= $c;
                        break;
                }
            }
            $document .= "\n";
        }

        return $document;
    }
    /**
     * Extract the readable text from a PDF file.
     *
     * Every "obj ... endobj" section that carries a stream is decoded. Decoded
     * streams containing BT ... ET text blocks contribute raw text fragments;
     * all other decoded streams are scanned for character mappings
     * (bfchar / bfrange). Objects whose dictionary has a Length1, Type or
     * Subtype entry are skipped.
     *
     * @param string $filename Path of the PDF file to read.
     * @return string The extracted text, or "" when the file cannot be read or is empty.
     */
    function pdf2text($filename) {
        // FILE_BINARY is intentionally not passed: the second parameter of
        // file_get_contents() is $use_include_path, and the constant is
        // deprecated as of PHP 8.1.
        $infile = @file_get_contents($filename);
        if (empty($infile))
            return "";

        // Collected text fragments and character code mappings.
        $transformations = array();
        $texts = array();

        // If the regex engine fails, fall back to an empty object list instead
        // of iterating over null.
        preg_match_all("#obj(.*)endobj#ismU", $infile, $objects);
        $objects = (isset($objects[1]) && is_array($objects[1])) ? $objects[1] : array();

        foreach ($objects as $currentObject) {
            // Only objects that include a data stream are of interest.
            if (!preg_match("#stream(.*)endstream#ismU", $currentObject, $stream))
                continue;
            $stream = ltrim($stream[1]);

            $options = getObjectOptions($currentObject);
            if (!(empty($options["Length1"]) && empty($options["Type"]) && empty($options["Subtype"])))
                continue;

            // A failed decode yields nothing to analyze.
            $data = getDecodedStream($stream, $options);
            if (!is_string($data) || $data === "")
                continue;

            if (preg_match_all("#BT(.*)ET#ismU", $data, $textContainers))
                getDirtyTexts($texts, $textContainers[1]);
            else
                getCharTransformations($transformations, $data);
        }

        return getTextUsingTransformations($texts, $transformations);
    }
    245. ?>
    Display All