Tripal Plant PopGen Submit
file_utils.inc
Go to the documentation of this file.
1 <?php
2 
25 function tpps_parse_file($fid, $max_rows = 0, $no_header = FALSE, $columns = NULL) {
26  $content = array();
27  $options = array(
28  'no_header' => $no_header,
29  'columns' => $columns,
30  'max_rows' => $max_rows,
31  'content' => &$content,
32  );
33 
34  tpps_file_iterator($fid, 'tpps_parse_file_helper', $options);
35  $content['headers'] = tpps_file_headers($fid, $no_header);
36  return $content;
37 }
38 
52 function tpps_parse_file_helper($row, array &$options) {
53  $options['content'][] = $row;
54 }
55 
65 function tpps_file_len($fid) {
66  $file = file_load($fid);
67  $location = tpps_get_location($file->uri);
68  $extension = tpps_get_path_extension($location);
69  $count = 0;
70  $options = array(
71  'count' => &$count,
72  );
73  if ($extension == 'vcf') {
74  $options['skip_prefix'] = '#';
75  }
76  tpps_file_iterator($fid, 'tpps_file_len_helper', $options);
77  return $count;
78 }
79 
92 function tpps_file_len_helper($row, array &$options = array()) {
93  if (empty($options['skip_prefix'])) {
94  $options['count']++;
95  }
96  else {
97  if (substr(current($row), 0, strlen($options['skip_prefix'])) != $options['skip_prefix']) {
98  $options['count']++;
99  }
100  }
101 }
102 
115 function tpps_file_width($fid) {
116  $file = file_load($fid);
117  $location = tpps_get_location($file->uri);
118  $extension = tpps_get_path_extension($location);
119  switch ($extension) {
120  case 'xlsx':
121  return tpps_xlsx_width($location);
122 
123  case 'txt':
124  case 'csv':
125  return tpps_flat_width($location);
126 
127  case 'vcf':
128  case 'gz':
129  return tpps_flat_width($location, "\t");
130 
131  default:
132  throw new Exception('Unrecognized file type');
133  }
134 }
135 
145 function tpps_xlsx_width($location) {
146  $dir = drupal_realpath(TPPS_TEMP_XLSX);
147 
148  $zip = new ZipArchive();
149  $zip->open($location);
150  $zip->extractTo($dir);
151 
152  $data_location = $dir . '/xl/worksheets/sheet1.xml';
153 
154  $dimension = tpps_xlsx_get_dimension($data_location);
155  preg_match('/([A-Z]+)[0-9]+:([A-Z]+)[0-9]+/', $dimension, $matches);
156  $left_hex = unpack('H*', $matches[1]);
157  $hex = $left_hex[1];
158  $right_hex = unpack('H*', $matches[2]);
159 
160  $len = 0;
161  while (base_convert($hex, 16, 10) <= base_convert($right_hex[1], 16, 10)) {
162  $len++;
163  $hex = tpps_increment_hex($hex);
164  }
165  return $len;
166 }
167 
179 function tpps_flat_width($location, $delim = ',') {
180  $original = ini_get('auto_detect_line_endings');
181  if (!$original) {
182  ini_set('auto_detect_line_endings', TRUE);
183  }
184 
185  $longest = 0;
186  $handle = gzopen($location, 'r');
187 
188  while (($line = gzgets($handle))) {
189  $line = str_getcsv($line, $delim);
190  $longest = max($longest, count($line));
191  }
192 
193  fclose($handle);
194  ini_set('auto_detect_line_endings', $original);
195  return $longest;
196 }
197 
216 function tpps_increment_hex($hex) {
217  $add = 1;
218  $new = "";
219  for ($i = strlen(pack('H*', $hex)) - 1; $i >= 0; $i--) {
220  $num = base_convert(substr($hex, (2 * $i), 2), 16, 10) + $add;
221  if ($num > 90) {
222  $num = 65;
223  }
224  else {
225  $add = 0;
226  }
227  $new = base_convert($num, 10, 16) . $new;
228  if ($i == 0 and $add) {
229  $new = '41' . $new;
230  }
231  }
232  return $new;
233 }
234 
251 function tpps_convert_colname($column) {
252  $total = 0;
253  $base_16 = unpack('H*', 'A')[1];
254  $base = base_convert($base_16, 16, 10) - 1;
255  for ($i = 0; $i < strlen($column); $i++) {
256  $total *= 26;
257  $num_16 = unpack('H*', $column[$i])[1];
258  $total += base_convert($num_16, 16, 10) - $base;
259  }
260  return $total - 1;
261 }
262 
279 function tpps_rename_file($fid, $new_name, array $options = array()) {
280  if (!array_key_exists('preserve', $options)) {
281  $options['preserve'] = FALSE;
282  }
283 
284  if (!empty($fid) and ($file = file_load($fid))) {
285  $new_filename = $new_name . "." . tpps_get_path_extension($file->uri);
286  if (!preg_match('/^(.*\/)(.*)$/', $new_name, $matches)) {
287  preg_match('/^(.*\/).*$/', $file->uri, $matches);
288  $new_filename = $matches[1] . $new_filename;
289  }
290 
291  if ($options['preserve']) {
292  $file->status = FILE_STATUS_PERMANENT;
293  file_save($file);
294  // Check if file already exists then return that file.
295  $result = db_query('SELECT f.fid
296  FROM {file_managed} f WHERE f.uri = :uri', array(':uri' => $new_filename));
297  $record = $result->fetchObject();
298  if(isset($record)) {
299  return file_load($record->fid);
300  }
301  return file_copy($file, $new_filename, FILE_EXISTS_RENAME);
302  }
303  // Check if file already exists then return that file.
304  $result = db_query('SELECT f.fid
305  FROM {file_managed} f WHERE f.uri = :uri', array(':uri' => $new_filename));
306  $record = $result->fetchObject();
307 
308  if(isset($record->fid)) {
309  // print_r("FID:" . $record->fid . "\n");
310  return file_load($record->fid);
311  }
312  $file_object = file_move($file, $new_filename, FILE_EXISTS_RENAME);
313  // print_r("FILE OBJECT:\n");
314  // print_r($file_object);
315  // print_r("\n");
316  return $file_object;
317  }
318  throw new Exception("Error: could not open file");
319 }
320 
334 function tpps_parse_file_column($fid, $column, $no_header = FALSE) {
335  $content = array();
336  $options = array(
337  'no_header' => $no_header,
338  'columns' => array($column),
339  'content' => &$content,
340  );
341  tpps_file_iterator($fid, 'tpps_parse_file_column_helper', $options);
342  return $content;
343 }
344 
357 function tpps_parse_file_column_helper($row, array &$options) {
358  $options['content'][] = $row[current($options['columns'])];
359 }
360 
379 function tpps_compare_files($fid_1, $fid_2, $file_1_id_name, $file_2_id_name, $file_1_no_header = FALSE, $file_2_no_header = FALSE) {
380  $missing = array();
381  $file_1_content = array_unique(tpps_parse_file_column($fid_1, $file_1_id_name, $file_1_no_header));
382  $file_2_content = array_unique(tpps_parse_file_column($fid_2, $file_2_id_name, $file_2_no_header));
383  asort($file_1_content);
384  asort($file_2_content);
385  reset($file_1_content);
386  reset($file_2_content);
387 
388  while (current($file_1_content) !== FALSE and current($file_2_content) !== FALSE) {
389  if (current($file_1_content) < current($file_2_content)) {
390  $missing[] = current($file_1_content);
391  next($file_1_content);
392  continue;
393  }
394  elseif (current($file_1_content) > current($file_2_content)) {
395  next($file_2_content);
396  continue;
397  }
398  next($file_1_content);
399  next($file_2_content);
400  continue;
401  }
402 
403  while (current($file_1_content) !== FALSE) {
404  $missing[] = current($file_1_content);
405  next($file_1_content);
406  }
407  return $missing;
408 }
409 
442 function tpps_file_validate_columns(array &$form_state, array $required_groups, array $file_element) {
443  $cols = $file_element['#value']['columns'];
444 
445  $parents = $file_element['#parents'];
446  $new_end_columns = end($parents) . "-columns";
447  $new_end_no_header = end($parents) . "-no-header";
448  $new_end_empty = end($parents) . "-empty";
449  $group_path = array_pop($parents) . "-groups";
450  $values = drupal_array_get_nested_value($form_state['values'], $parents);
451  // Initialize form column values in form state.
452  $values[$new_end_columns] = array();
453  // Hold onto the location of the columns in form state.
454  $state_column_values = &$values[$new_end_columns];
455  $values[$new_end_no_header] = isset($file_element['#value']['no-header']) ? $file_element['#value']['no-header'] : NULL;
456  $values[$new_end_empty] = isset($file_element['#value']['empty']) ? $file_element['#value']['empty'] : NULL;
457 
458  $title_parts = explode(':', $file_element['#title']);
459  $error_prompt = $title_parts[0];
460 
461  $groups = array();
462  $required_groups_flat = array();
463  foreach ($required_groups as $group => $combinations) {
464  $groups[$group] = array();
465  $required_groups_flat[$group] = array();
466  foreach ($combinations as $name => $combination) {
467  $required_groups_flat[$group] = array_merge($required_groups_flat[$group], $combination);
468  }
469  }
470 
471  foreach ($cols as $name => $type) {
472  $state_column_values[$name] = $type;
473  foreach ($required_groups_flat as $group => $types) {
474  if (in_array($type, $types)) {
475  if (!isset($groups[$group][$type])) {
476  $groups[$group][$type] = array($name);
477  }
478  else {
479  $groups[$group][$type][] = $name;
480  }
481  break;
482  }
483  }
484  }
485  // dpm($required_groups);
486  foreach ($required_groups as $group => $combinations) {
487  $group_valid = FALSE;
488  $groups[$group]['#type'] = array();
489  foreach ($combinations as $name => $combination) {
490  $combination_valid = TRUE;
491  foreach ($combination as $type) {
492  if (!isset($groups[$group][$type])) {
493  $combination_valid = FALSE;
494  break;
495  }
496  }
497  if ($combination_valid) {
498  $groups[$group]['#type'][] = $name;
499  $group_valid = TRUE;
500  }
501  }
502 
503  if (!$group_valid) {
504  form_set_error($file_element['#name'] . "[columns][$group", "$error_prompt: Please specify a column or columns that hold $group.");
505  }
506  }
507 
508  foreach ($groups as $key => $group) {
509  foreach ($group as $opt_num => $col_names) {
510  if (count($col_names) == 1) {
511  $groups[$key][$opt_num] = $col_names[0];
512  }
513  }
514  }
515 
516  $values[$group_path] = $groups;
517  drupal_array_set_nested_value($form_state['values'], $parents, $values);
518 
519  return $groups;
520 }
521 
533 function tpps_save_file_columns(array &$form, array &$form_state) {
534 
535  if ($form_state['stage'] == TPPS_PAGE_3) {
536  for ($i = 1; $i <= $form_state['stats']['species_count']; $i++) {
537  if (($file = file_load($form_state['values']['tree-accession']["species-$i"]['file'])) and $form_state['values']['tree-accession']["species-$i"]['file'] != 0) {
538  file_usage_add($file, 'tpps', 'tpps_project', substr($form_state['accession'], 4));
539  }
540 
541  $form_state['values']['tree-accession']["species-$i"]['file-columns'] = array();
542 
543  if (isset($form['tree-accession']["species-$i"]['file']['#value']['columns'])) {
544  foreach ($form['tree-accession']["species-$i"]['file']['#value']['columns'] as $col => $val) {
545  if ($col[0] != '#') {
546  $form_state['values']['tree-accession']["species-$i"]['file-columns'][$col] = $form['tree-accession']["species-$i"]['file']['#value']['columns'][$col];
547  }
548  }
549  }
550 
551  if (empty($form_state['values']['tree-accession']['check'])) {
552  break;
553  }
554  }
555  }
556  elseif ($form_state['stage'] == TPPS_PAGE_4) {
557  if (isset($form['organism-1']['genotype'])) {
558  for ($i = 1; $i <= $form_state['saved_values'][TPPS_PAGE_1]['organism']['number']; $i++) {
559 
560  if (!empty($form_state['values']["organism-$i"]['genotype']['files']['snps-assay']) and ($file = file_load($form_state['values']["organism-$i"]['genotype']['files']['snps-assay']))) {
561  file_usage_add($file, 'tpps', 'tpps_project', substr($form_state['accession'], 4));
562 
563  $form_state['values']["organism-$i"]['genotype']['files']['snps-assay-columns'] = array();
564 
565  if (isset($form["organism-$i"]['genotype']['files']['snps-assay']['#value']['columns'])) {
566  foreach ($form["organism-$i"]['genotype']['files']['snps-assay']['#value']['columns'] as $col => $val) {
567  if ($col[0] != '#') {
568  $form_state['values']["organism-$i"]['genotype']['files']['snps-assay-columns'][$col] = $form["organism-$i"]['genotype']['files']['snps-assay']['#value']['columns'][$col];
569  }
570  }
571  }
572  }
573 
574  if (!empty($form_state['values']["organism-$i"]['genotype']['files']['other']) and ($file = file_load($form_state['values']["organism-$i"]['genotype']['files']['other']))) {
575  file_usage_add($file, 'tpps', 'tpps_project', substr($form_state['accession'], 4));
576 
577  $form_state['values']["organism-$i"]['genotype']['files']['other-columns'] = array();
578 
579  if (isset($form["organism-$i"]['genotype']['files']['other']['#value']['columns'])) {
580  foreach ($form["organism-$i"]['genotype']['files']['other']['#value']['columns'] as $col => $val) {
581  if ($col[0] != '#') {
582  $form_state['values']["organism-$i"]['genotype']['files']['other-columns'][$col] = $form["organism-$i"]['genotype']['files']['other']['#value']['columns'][$col];
583  }
584  }
585  }
586  }
587  }
588  }
589 
590  if (isset($form['organism-1']['phenotype'])) {
591  for ($i = 1; $i <= $form_state['saved_values'][TPPS_PAGE_1]['organism']['number']; $i++) {
592  if (!empty($form_state['values']["organism-$i"]['phenotype']['normal-check'])) {
593  if (($file = file_load($form_state['values']["organism-$i"]['phenotype']['file'])) and $form_state['values']["organism-$i"]['phenotype']['file'] != 0) {
594  file_usage_add($file, 'tpps', 'tpps_project', substr($form_state['accession'], 4));
595  }
596 
597  $form_state['values']["organism-$i"]['phenotype']['file-columns'] = array();
598 
599  if (isset($form["organism-$i"]['phenotype']['file']['#value']['columns'])) {
600  foreach ($form["organism-$i"]['phenotype']['file']['#value']['columns'] as $col => $val) {
601  if ($col[0] != '#') {
602  $form_state['values']["organism-$i"]['phenotype']['file-columns'][$col] = $form["organism-$i"]['phenotype']['file']['#value']['columns'][$col];
603  }
604  }
605  }
606 
607  if ($form_state['values']["organism-$i"]['phenotype']['check'] != '0') {
608  if (($file = file_load($form_state['values']["organism-$i"]['phenotype']['metadata'])) and $form_state['values']["organism-$i"]['phenotype']['metadata'] != 0) {
609  file_usage_add($file, 'tpps', 'tpps_project', substr($form_state['accession'], 4));
610  }
611 
612  $form_state['values']["organism-$i"]['phenotype']['metadata-columns'] = array();
613 
614  if (isset($form["organism-$i"]['phenotype']['metadata']['#value']['columns'])) {
615  foreach ($form["organism-$i"]['phenotype']['metadata']['#value']['columns'] as $col => $val) {
616  if ($col[0] != '#') {
617  $form_state['values']["organism-$i"]['phenotype']['metadata-columns'][$col] = $form["organism-$i"]['phenotype']['metadata']['#value']['columns'][$col];
618  }
619  }
620  }
621  }
622  }
623  }
624  }
625  }
626 }
627 
640 function tpps_get_location($location) {
641  $location = drupal_realpath($location);
642  $extension = tpps_get_path_extension($location);
643  if ($extension == 'zip' or $extension == 'tar') {
644  $files = tpps_get_archive_files($location);
645  if ($files and (count($files) == 3 or (count($files) == 4 and substr($files[3], -9) === '/__MACOSX')) and !is_dir($files[2])) {
646  $location = $files[2];
647  }
648  }
649  return $location;
650 }
651 
661 function tpps_get_path_extension($path) {
662  preg_match('/\.([a-zA-Z0-9]*)$/', $path, $matches);
663 
664  if(count($matches) == 0) {
665  return ""; // Added by Rish during debugging process. XLSX extracting creates files without paths
666  }
667  return strtolower($matches[1]);
668 
669 }
670 
682 function tpps_xlsx_get_dimension($location) {
683  $reader = new XMLReader();
684  $reader->open($location);
685  while ($reader->read()) {
686  if ($reader->nodeType == XMLReader::ELEMENT and $reader->name == 'dimension') {
687  $dim = $reader->getAttribute('ref');
688  $reader->close();
689  return $dim;
690  }
691  }
692  return NULL;
693 }
694 
714 function tpps_xlsx_get_rows(array &$readers, array &$strings = array(), $plain = TRUE, $columns = NULL) {
715  $rows = array();
716  $empty = TRUE;
717  foreach ($readers as $key => $r) {
718  $reader = &$readers[$key];
719  $row = tpps_xlsx_get_row($reader, $strings, $plain, $columns);
720  if ($row) {
721  $rows[$key] = $row;
722  $empty = FALSE;
723  }
724  else {
725  $rows[$key] = NULL;
726  }
727  }
728  if ($empty) {
729  return FALSE;
730  }
731  return $rows;
732 }
733 
755 function tpps_xlsx_get_row(&$reader, array &$strings = array(), $plain = TRUE, $columns = NULL) {
756  while ($reader->read()) {
757  if ($reader->nodeType == XMLReader::ELEMENT and $reader->name == 'row') {
758  if ($plain) {
759  $cells = array();
760  while (!($reader->nodeType == XMLReader::END_ELEMENT and $reader->name == 'row') and $reader->read()) {
761  if ($reader->nodeType == XMLReader::ELEMENT and $reader->name == 'c') {
762  $cell_type = $reader->getAttribute('t');
763  preg_match('/([A-Z]+)([0-9]+)/', $reader->getAttribute('r'), $matches);
764  $cell_location = $matches[1];
765  if (!empty($columns) and !isset($columns[$cell_location])) {
766  continue;
767  }
768  $cells[$cell_location] = tpps_xlsx_get_cell_value($reader, $cell_type, $strings);
769  }
770  }
771  return $cells;
772  }
773  return TRUE;
774  }
775  }
776  return FALSE;
777 }
778 
791 function tpps_xlsx_translate_date($date) {
792  if (strtotime($date) !== FALSE) {
793  return $date;
794  }
795 
796  if ($date > 60) {
797  $date = $date - 1;
798  return date("m/d/Y", strtotime("12/31/1899 +$date days"));
799  }
800  if ($date < 60) {
801  return date("m/d/Y", strtotime("12/31/1899 +$date days"));
802  }
803 
804  return NULL;
805 }
806 
827 function tpps_xlsx_get_cell_value(&$reader, $cell_type, array &$strings = array()) {
828  if ($reader->isEmptyElement) {
829  return NULL;
830  }
831 
832  while (!($reader->nodeType == XMLReader::END_ELEMENT and ($reader->name == 'c' or $reader->name == 'row')) and $reader->read()) {
833  if ($reader->nodeType == XMLReader::ELEMENT and $reader->name == 'v') {
834  $value = $reader->readInnerXml();
835  if (!empty($cell_type) and $cell_type == 's') {
836  $value = $strings[$value];
837  }
838  return $value;
839  }
840  }
841  return NULL;
842 }
843 
853 function tpps_xlsx_get_strings($strings_location) {
854  $strings = array();
855  $reader = new XMLReader();
856  $reader->open($strings_location);
857  $count = 0;
858  while ($reader->read()) {
859  if ($reader->nodeType == XMLReader::ELEMENT and $reader->name == 'si') {
860  $string = "";
861 
862  while (!($reader->nodeType == XMLReader::END_ELEMENT and $reader->name == 'si') and $reader->read()) {
863  if ($reader->nodeType == XMLReader::ELEMENT and $reader->name == 't') {
864  $string .= $reader->readInnerXml();
865  }
866  }
867  $strings[$count++] = $string;
868  }
869  }
870  $reader->close();
871  return $strings;
872 }
873 
884 function tpps_rmdir($dir) {
885  if (is_dir($dir)) {
886  $children = scandir($dir);
887  foreach ($children as $child) {
888  if ($child != '.' and $child != '..') {
889  if (is_dir($dir . '/' . $child) and !is_link($dir . '/' . $child)) {
890  tpps_rmdir($dir . '/' . $child);
891  }
892  else {
893  unlink($dir . '/' . $child);
894  }
895  }
896  }
897  rmdir($dir);
898  }
899 }
900 
911 function tpps_get_archive_files($archive) {
912  $files = array();
913  if (is_object($archive) and !empty($archive->uri)) {
914  $archive = $archive->uri;
915  }
916  $loc = drupal_realpath($archive);
917 
918  $ext = tpps_get_path_extension($loc);
919  $unzip_dir = drupal_realpath('temporary://tpps_tmp');
920  if (is_dir($unzip_dir)) {
921  tpps_rmdir($unzip_dir);
922  }
923 
924  if (is_dir($unzip_dir) or mkdir($unzip_dir)) {
925  switch ($ext) {
926  case 'zip':
927  $zip = new \ZipArchive();
928  $zip->open($loc);
929  $zip->extractTo($unzip_dir);
930  break;
931 
932  case 'gz':
933  $zip = new \Archive_Tar($loc, 'gz');
934  $zip->extract($unzip_dir);
935  break;
936 
937  case 'tar':
938  $zip = new \Archive_Tar($loc);
939  $zip->extract($unzip_dir);
940  break;
941 
942  default:
943  return array();
944  }
945 
946  $dir = $unzip_dir;
947  $files = scandir($dir);
948  if ($files and count($files) == 3 and is_dir($dir . '/' . $files[2])) {
949  $dir .= '/' . $files[2];
950  $files = scandir($dir);
951  }
952  }
953 
954  foreach ($files as $key => $name) {
955  $files[$key] = $dir . '/' . $name;
956  }
957 
958  return $files;
959 }
960 
972 function tpps_file_headers($fid, $no_header = FALSE) {
973  $headers = array();
974  if ($no_header) {
975  $hex = unpack('H*', 'A')[1];
976  $width = tpps_file_width($fid);
977  for ($i = 0; $i < $width; $i++) {
978  $key = pack('H*', $hex);
979  $headers[$key] = $i;
980  $hex = tpps_increment_hex($hex);
981  }
982  return $headers;
983  }
984 
985  $content = array();
986  $options = array(
987  'no_header' => TRUE,
988  'max_rows' => 1,
989  'content' => &$content,
990  );
991  tpps_file_iterator($fid, 'tpps_parse_file_helper', $options);
992  return current($content);
993 }
994 
1011 function tpps_file_iterator($fid, $function, array &$options = array()) {
1012  if (!function_exists($function)) {
1013  return FALSE;
1014  }
1015 
1016  if (!empty($options['job'])) {
1017  $options['job']->setTotalItems(tpps_file_len($fid));
1018  $options['job']->setItemsHandled(0);
1019  }
1020 
1021  $file = file_load($fid);
1022  $file_location = tpps_get_location($file->uri);
1023 
1024  foreach (tpps_file_generator($fid, $options) as $item) {
1025  // Assume the item IS NOT EMPTY
1026  $item_is_not_empty = true;
1027  if(is_array($item)) {
1028  // Check each key value
1029  $values_count = count($item);
1030  $values_empty_count = 0;
1031  foreach($item as $key => $value) {
1032  if($value == '') {
1033  $values_empty_count = $values_empty_count + 1;
1034  }
1035  }
1036  // Check if $values_count == $values_empty_count (this means item is empty)
1037  if ($values_count == $values_empty_count) {
1038  $item_is_not_empty = false;
1039  }
1040  }
1041 
1042  // if the item is not empty, proceed to process the line
1043  if ($item_is_not_empty) {
1044  $function($item, $options);
1045  if (!empty($options['job'])) {
1046  $options['job']->addItemsHandled(1);
1047  }
1048  }
1049  else {
1050  // print_r($item);
1051  echo json_encode($item) . "\n";
1052  echo "[TPPS FILE ITERATOR] Found a line/item that was empty, ignoring this item\n";
1053  echo "[TPPS FILE ITERATOR] File location: " . $file_location . "\n";
1054  // throw new Exception('DEBUG STOP - FOUND AN EMPTY LINE');
1055  }
1056  }
1057  return TRUE;
1058 }
1059 
1074 function tpps_file_generator($fid, array $options = array()) {
1075  $file = file_load($fid);
1076  $location = tpps_get_location($file->uri);
1077  $extension = tpps_get_path_extension($location);
1078  switch ($extension) {
1079  case 'xlsx':
1080  return tpps_xlsx_generator($location, $options);
1081 
1082  case 'txt':
1083  case 'csv':
1084  case 'vcf':
1085  case 'gz':
1086  return tpps_flat_generator($location, $options);
1087 
1088  default:
1089  if (empty($extension)) {
1090  $extension = "NULL";
1091  }
1092  $msg = "Unrecognized file type: $extension";
1093  throw new Exception($msg);
1094  }
1095 }
1096 
1112 function tpps_xlsx_generator($location, array $options = array()) {
1113  $dir = drupal_realpath(TPPS_TEMP_XLSX);
1114  $no_header = $options['no_header'] ?? FALSE;
1115  $columns = $options['columns'] ?? NULL;
1116  $max_rows = $options['max_rows'] ?? NULL;
1117 
1118  if (!empty($columns)) {
1119  $new_columns = array();
1120  foreach ($columns as $col) {
1121  $new_columns[$col] = $col;
1122  }
1123  $columns = $new_columns;
1124  }
1125 
1126  $zip = new ZipArchive();
1127  $zip->open($location);
1128  $zip->extractTo($dir);
1129 
1130  $strings_location = $dir . '/xl/sharedStrings.xml';
1131 
1132  // Find all worksheets.
1133  $sheets = scandir($dir . '/xl/worksheets');
1134  unset($sheets[array_search('.', $sheets)]);
1135  unset($sheets[array_search('..', $sheets)]);
1136 
1137  $strings = tpps_xlsx_get_strings($strings_location);
1138 
1139  // Get dimensions, readers.
1140  $first_left = NULL;
1141  $last_right = NULL;
1142  $dims = array();
1143  $readers = array();
1144  foreach ($sheets as $sheet) {
1145  // dpm('SHEET location: ' . $dir . '/xl/worksheets/' . $sheet);
1146  if (tpps_get_path_extension($dir . '/xl/worksheets/' . $sheet) !== 'xml') {
1147  continue;
1148  }
1149  $loc = $dir . '/xl/worksheets/' . $sheet;
1150  $dimension = tpps_xlsx_get_dimension($loc);
1151  // dpm($loc);
1152  // dpm($dimension);
1153  preg_match('/([A-Z]+)[0-9]+:([A-Z]+)[0-9]+/', $dimension, $matches);
1154  // dpm($matches);
1155  if(count($matches) < 3) {
1156  // The dimensions are invalid (probably an empty worksheet)
1157  }
1158  else {
1159  // Matches found dimensions, continue processing this as a valid sheet
1160  $left_hex = unpack('H*', $matches[1]);
1161  $hex = $left_hex[1];
1162  $right_hex = unpack('H*', $matches[2]);
1163  if ($first_left == NULL) {
1164  $first_left = $left_hex[1];
1165  $last_right = $left_hex[1] - 1;
1166  }
1167  while (base_convert($hex, 16, 10) <= base_convert($right_hex[1], 16, 10)) {
1168  $hex = tpps_increment_hex($hex);
1169  $last_right = tpps_increment_hex($last_right);
1170  }
1171  $dims[] = array($left_hex[1], $right_hex[1]);
1172  $reader = new XMLReader();
1173  $reader->open($loc);
1174  $readers[] = $reader;
1175  }
1176  }
1177 
1178  // If the file has a header row, skip it.
1179  if (!$no_header) {
1180  tpps_xlsx_get_rows($readers, $strings);
1181  }
1182 
1183  // Iterate through file.
1184  $count = 0;
1185  while (($rows = tpps_xlsx_get_rows($readers, $strings, TRUE, $columns))) {
1186  if (!empty($max_rows) and $count >= $max_rows) {
1187  break;
1188  }
1189  $count++;
1190 
1191  $values = array();
1192  $key_hex = $first_left;
1193  foreach ($readers as $idx => $read) {
1194  $row = $rows[$idx];
1195  if (!empty($row)) {
1196  ksort($row);
1197  }
1198  $hex = $dims[$idx][0];
1199  while (base_convert($hex, 16, 10) <= base_convert($dims[$idx][1], 16, 10)) {
1200  $key = pack('H*', $key_hex);
1201  $row_key = pack('H*', $hex);
1202  if (empty($columns) or array_search($key, $columns) !== FALSE) {
1203  $values[$key] = isset($row[$row_key]) ? trim($row[$row_key]) : NULL;
1204  }
1205  $hex = tpps_increment_hex($hex);
1206  $key_hex = tpps_increment_hex($key_hex);
1207  }
1208  }
1209  yield $values;
1210  }
1211 
1212  // Close readers.
1213  foreach ($readers as $reader) {
1214  $reader->close();
1215  }
1216  tpps_rmdir($dir);
1217 }
1218 
1234 function tpps_flat_generator($location, array $options = array()) {
1235  $original = ini_get('auto_detect_line_endings');
1236  if (!$original) {
1237  ini_set('auto_detect_line_endings', TRUE);
1238  }
1239 
1240  $no_header = $options['no_header'] ?? FALSE;
1241  $columns = $options['columns'] ?? NULL;
1242  $max_rows = $options['max_rows'] ?? NULL;
1243 
1244  $delim = ',';
1245  $longest = tpps_flat_width($location, $delim);
1246  $handle = gzopen($location, 'r');
1247 
1248  if (!$no_header) {
1249  gzgets($handle);
1250  }
1251  $count = 0;
1252 
1253  while (($line = gzgets($handle))) {
1254  $vals = str_getcsv($line, $delim);
1255  if (!empty($max_rows) and $count >= $max_rows) {
1256  break;
1257  }
1258  $count++;
1259 
1260  $values = array();
1261 
1262  if (empty($columns)) {
1263  $hex = unpack('H*', 'A')[1];
1264  for ($i = 0; $i < $longest; $i++) {
1265  $key = pack('H*', $hex);
1266  $values[$key] = isset($vals[$i]) ? trim($vals[$i]) : NULL;
1267  if (isset($values[$key]) and !check_plain($values[$key])) {
1268  $values[$key] = trim(mb_convert_encoding($values[$key], "UTF-8", "Windows-1252"));
1269  }
1270  $hex = tpps_increment_hex($hex);
1271  }
1272  yield $values;
1273  continue;
1274  }
1275 
1276  foreach ($columns as $column) {
1277  $values[$column] = $vals[tpps_convert_colname($column)] ?? NULL;
1278  }
1279 
1280  yield $values;
1281  }
1282 
1283  fclose($handle);
1284  ini_set('auto_detect_line_endings', $original);
1285 }
const TPPS_PAGE_1
Definition: tpps.module:12
tpps_flat_generator($location, array $options=array())
tpps_xlsx_get_rows(array &$readers, array &$strings=array(), $plain=TRUE, $columns=NULL)
Definition: file_utils.inc:714
tpps_get_archive_files($archive)
Definition: file_utils.inc:911
tpps_file_width($fid)
Definition: file_utils.inc:115
tpps_increment_hex($hex)
Definition: file_utils.inc:216
const TPPS_TEMP_XLSX
Definition: tpps.module:16
tpps_xlsx_translate_date($date)
Definition: file_utils.inc:791
tpps_flat_width($location, $delim=',')
Definition: file_utils.inc:179
tpps_parse_file_helper($row, array &$options)
Definition: file_utils.inc:52
tpps_xlsx_get_strings($strings_location)
Definition: file_utils.inc:853
tpps_xlsx_get_cell_value(&$reader, $cell_type, array &$strings=array())
Definition: file_utils.inc:827
tpps_get_path_extension($path)
Definition: file_utils.inc:661
tpps_xlsx_get_row(&$reader, array &$strings=array(), $plain=TRUE, $columns=NULL)
Definition: file_utils.inc:755
tpps_file_len($fid)
Definition: file_utils.inc:65
tpps_file_headers($fid, $no_header=FALSE)
Definition: file_utils.inc:972
tpps_save_file_columns(array &$form, array &$form_state)
Definition: file_utils.inc:533
tpps_parse_file_column_helper($row, array &$options)
Definition: file_utils.inc:357
tpps_xlsx_generator($location, array $options=array())
tpps_parse_file_column($fid, $column, $no_header=FALSE)
Definition: file_utils.inc:334
tpps_xlsx_width($location)
Definition: file_utils.inc:145
const TPPS_PAGE_4
Definition: tpps.module:15
tpps_file_len_helper($row, array &$options=array())
Definition: file_utils.inc:92
tpps_xlsx_get_dimension($location)
Definition: file_utils.inc:682
tpps_get_location($location)
Definition: file_utils.inc:640
tpps_convert_colname($column)
Definition: file_utils.inc:251
tpps_rename_file($fid, $new_name, array $options=array())
Definition: file_utils.inc:279
tpps_file_generator($fid, array $options=array())
tpps_file_validate_columns(array &$form_state, array $required_groups, array $file_element)
Definition: file_utils.inc:442
const TPPS_PAGE_3
Definition: tpps.module:14
tpps_file_iterator($fid, $function, array &$options=array())
tpps_compare_files($fid_1, $fid_2, $file_1_id_name, $file_2_id_name, $file_1_no_header=FALSE, $file_2_no_header=FALSE)
Definition: file_utils.inc:379
tpps_parse_file($fid, $max_rows=0, $no_header=FALSE, $columns=NULL)
Definition: file_utils.inc:25
tpps_rmdir($dir)
Definition: file_utils.inc:884