Parses genotype and phenotype files and creates records in the database.
This function is called only from Tripal Jobs, because of the size of the genotype and phenotype files and the number of records needed to store and integrate the data properly.
23 $fourthpage = $form_state[
'saved_values'][
TPPS_PAGE_4];
24 $firstpage = $form_state[
'saved_values'][
TPPS_PAGE_1];
25 $organism_number = $firstpage[
'organism'][
'number'];
26 $file_rank = &$form_state[
'file_rank'];
27 $record_group = variable_get(
'tpps_record_group', 10000);
28 $project_id = $form_state[
'ids'][
'project_id'];
29 $species_codes = array();
30 for ($i = 1; $i <= $organism_number; $i++) {
31 $species_codes[$form_state[
'ids'][
'organism_ids'][$i]] = current(chado_select_record(
'organismprop', array(
'value'), array(
33 'name' =>
'organism 4 letter code',
35 'organism_id' => $form_state[
'ids'][
'organism_ids'][$i],
41 for ($i = 1; $i <= $organism_number; $i++) {
42 $parts = explode(
" ", $firstpage[
'organism'][$i]);
44 $species = implode(
" ", array_slice($parts, 1));
46 if (isset($parts[2]) and ($parts[2] ==
'var.' or $parts[2] ==
'subsp.')) {
47 $infra = implode(
" ", array_slice($parts, 2));
53 if (isset($fourthpage[
"organism-$i"][
'phenotype'])) {
54 $phenotype = $fourthpage[
"organism-$i"][
'phenotype'];
57 $phenotype_cvterms = array(
58 'time' => chado_get_cvterm(array(
62 'desc' => chado_get_cvterm(array(
63 'name' =>
'description',
69 'unit' => chado_get_cvterm(array(
76 'min' => chado_get_cvterm(array(
83 'max' => chado_get_cvterm(array(
93 'phenotype' => array(),
94 'phenotypeprop' => array(),
95 'stock_phenotype' => array(),
100 'records' => $records,
101 'cvterms' => $phenotype_cvterms,
102 'accession' => $form_state[
'accession'],
103 'tree_info' => $form_state[
'tree_info'],
105 'phenotype_count' => $phenotype_count,
108 if (empty($phenotype[
'iso-check'])) {
109 $phenotype_number = $phenotype[
'phenotypes-meta'][
'number'];
110 $phenotypes_meta = array();
113 'project_id' => $project_id,
121 'value' => file_create_url(file_load($phenotype[
'file'])->uri),
122 'rank' => $file_rank,
127 for ($j = 1; $j <= $phenotype_number; $j++) {
128 $name = strtolower($phenotype[
'phenotypes-meta'][$j][
'name']);
129 $phenotypes_meta[$name] = array();
130 $phenotypes_meta[$name][
'attr'] = $phenotype[
'phenotypes-meta'][$j][
'attribute'];
131 $phenotypes_meta[$name][
'desc'] = $phenotype[
'phenotypes-meta'][$j][
'description'];
132 $phenotypes_meta[$name][
'unit'] = $phenotype[
'phenotypes-meta'][$j][
'units'];
133 if ($phenotype[
'phenotypes-meta'][$j][
'struct-check'] ==
'1') {
134 $phenotypes_meta[$name][
'struct'] = $phenotype[
'phenotypes-meta'][$j][
'structure'];
136 if (!empty($phenotype[
'phenotypes-meta'][$j][
'val-check']) or !empty($phenotype[
'phenotypes-meta'][$j][
'bin-check'])) {
137 $phenotypes_meta[$name][
'min'] = $phenotype[
'phenotypes-meta'][$j][
'min'];
138 $phenotypes_meta[$name][
'max'] = $phenotype[
'phenotypes-meta'][$j][
'max'];
140 if ($phenotype[
'phenotypes-meta'][$j][
'time-check'] ==
'1') {
141 $phenotypes_meta[$name][
'time'] = $phenotype[
'phenotypes-meta'][$j][
'time'];
145 if ($phenotype[
'check'] ==
'1') {
147 'project_id' => $project_id,
155 'value' => file_create_url(file_load($phenotype[
'metadata'])->uri),
156 'rank' => $file_rank,
161 $groups = $phenotype[
'metadata-groups'];
162 $column_vals = $phenotype[
'metadata-columns'];
163 $struct = array_search(
'5', $column_vals);
164 $min = array_search(
'6', $column_vals);
165 $max = array_search(
'7', $column_vals);
167 'name' => $groups[
'Phenotype Id'][
'1'],
168 'attr' => $groups[
'Attribute'][
'2'],
169 'desc' => $groups[
'Description'][
'3'],
170 'unit' => $groups[
'Units'][
'4'],
171 'struct' => !empty($struct) ? $struct : NULL,
172 'min' => !empty($min) ? $min : NULL,
173 'max' => !empty($max) ? $max : NULL,
176 $meta_options = array(
177 'no_header' => $phenotype[
'metadata-no-header'],
178 'meta_columns' => $columns,
179 'meta' => &$phenotypes_meta,
182 tpps_file_iterator($phenotype[
'metadata'],
'tpps_process_phenotype_meta', $meta_options);
188 $groups = $phenotype[
'file-groups'];
189 $column_vals = $phenotype[
'file-columns'];
190 $time_index = ($phenotype[
'format'] == 0) ?
'2' :
'4';
191 $clone_index = ($phenotype[
'format'] == 0) ?
'3' :
'5';
192 $time = array_search($time_index, $column_vals);
193 $clone = array_search($clone_index, $column_vals);
194 $meta_headers = array(
195 'name' => $groups[
'Phenotype Name/Identifier'][
'2'] ?? NULL,
196 'value' => $groups[
'Phenotype Value(s)'][
'3'] ?? NULL,
197 'time' => !empty($time) ? $time : NULL,
198 'clone' => !empty($clone) ? $clone : NULL,
202 if ($phenotype[
'format'] == 0) {
203 $file_headers =
tpps_file_headers($phenotype[
'file'], $phenotype[
'file-no-header']);
204 $data_columns = array();
205 foreach ($groups[
'Phenotype Data'][
'0'] as $col) {
206 $data_columns[$col] = $file_headers[$col];
208 unset($file_headers);
211 $options[
'no_header'] = $phenotype[
'file-no-header'];
212 $options[
'tree_id'] = $groups[
'Tree Identifier'][
'1'];
213 $options[
'meta_headers'] = $meta_headers;
214 $options[
'data_columns'] = $data_columns ?? NULL;
215 $options[
'meta'] = $phenotypes_meta;
216 $options[
'file_empty'] = $phenotype[
'file-empty'];
222 'project_id' => $project_id,
230 'value' => file_create_url(file_load($phenotype[
'iso'])->uri),
231 'rank' => $file_rank,
235 $options[
'iso'] = TRUE;
236 $options[
'records'] = $records;
237 $options[
'cvterms'] = $phenotype_cvterms;
239 $options[
'meta'] = array(
240 'desc' =>
"Mass Spectrometry",
241 'unit' =>
"intensity (arbitrary units)",
242 'attr_id' => chado_get_cvterm(array(
243 'name' =>
'intensity',
251 unset($options[
'records']);
254 if (isset($fourthpage[
"organism-$i"][
'genotype'])) {
255 $genotype = $fourthpage[
"organism-$i"][
'genotype'];
258 $seq_var_cvterm = chado_get_cvterm(array(
260 'name' =>
'sequence',
262 'name' =>
'sequence_variant',
266 'genotype_call' => array(
268 'table' =>
'feature',
270 'variant_id' =>
'feature_id',
274 'table' =>
'feature',
276 'marker_id' =>
'feature_id',
283 'feature' => array(),
284 'genotype' => array(),
285 'genotype_call' => array(),
286 'stock_genotype' => array(),
289 $multi_insert_options = array(
290 'fk_overrides' => $overrides,
292 'label' =>
'Genotype',
293 'table' =>
'genotype',
298 'records' => $records,
299 'tree_info' => $form_state[
'tree_info'],
300 'species_codes' => $species_codes,
301 'genotype_count' => $genotype_count,
302 'genotype_total' => &$genotype_total,
303 'project_id' => $project_id,
304 'seq_var_cvterm' => $seq_var_cvterm,
305 'overrides' => $overrides,
306 'multi_insert' => $multi_insert_options,
343 if ($genotype[
'ref-genome'] ==
'manual' or $genotype[
'ref-genome'] ==
'manual2' or $genotype[
'ref-genome'] ==
'url') {
344 if ($genotype[
'tripal_fasta'][
'file_upload']) {
346 $assembly_user = $genotype[
'tripal_fasta'][
'file_upload'];
348 'project_id' => $project_id,
356 'value' => file_create_url(file_load($assembly_user)->uri),
357 'rank' => $file_rank,
361 if ($genotype[
'tripal_fasta'][
'file_upload_existing']) {
363 $assembly_user = $genotype[
'tripal_fasta'][
'file_upload_existing'];
365 'project_id' => $project_id,
373 'value' => file_create_url(file_load($assembly_user)->uri),
374 'rank' => $file_rank,
378 if ($genotype[
'tripal_fasta'][
'file_remote']) {
380 $assembly_user = $genotype[
'tripal_fasta'][
'file_remote'];
382 'project_id' => $project_id,
390 'value' => $assembly_user,
391 'rank' => $file_rank,
396 elseif ($genotype[
'ref-genome'] !=
'none') {
398 'project_id' => $project_id,
401 'name' =>
'sequence',
403 'name' =>
'reference_genome',
406 'value' => $genotype[
'ref-genome'],
410 if (!empty($genotype[
'files'][
'file-type'][
'SNPs Genotype Assay'])) {
412 'project_id' => $project_id,
420 'value' => file_create_url(file_load($genotype[
'files'][
'snps-assay'])->uri),
421 'rank' => $file_rank,
425 $options[
'type'] =
'snp';
427 $options[
'marker'] =
'SNP';
428 $options[
'type_cvterm'] = chado_get_cvterm(array(
430 'name' =>
'sequence',
436 tpps_file_iterator($genotype[
'files'][
'snps-assay'],
'tpps_process_genotype_spreadsheet', $options);
439 unset($options[
'records']);
440 $genotype_total += $genotype_count;
444 if (!empty($genotype[
'files'][
'file-type'][
'Assay Design']) and $genotype[
'marker-type'][
'SNPs']) {
446 'project_id' => $project_id,
454 'value' => file_create_url(file_load($genotype[
'files'][
'assay-design'])->uri),
455 'rank' => $file_rank,
460 if (!empty($genotype[
'files'][
'file-type'][
'SSRs/cpSSRs Genotype Spreadsheet'])) {
462 'project_id' => $project_id,
470 'value' => file_create_url(file_load($genotype[
'files'][
'ssrs'])->uri),
471 'rank' => $file_rank,
475 $options[
'type'] =
'ssrs';
476 $options[
'headers'] =
tpps_ssrs_headers($genotype[
'files'][
'ssrs'], $genotype[
'files'][
'ploidy']);
477 $options[
'marker'] = $genotype[
'SSRs/cpSSRs'];
478 $options[
'type_cvterm'] = chado_get_cvterm(array(
480 'name' =>
'sequence',
482 'name' =>
'microsatellite',
486 tpps_file_iterator($genotype[
'files'][
'ssrs'],
'tpps_process_genotype_spreadsheet', $options);
489 unset($options[
'records']);
493 if (!empty($genotype[
'files'][
'file-type'][
'Other Marker Genotype Spreadsheet'])) {
495 'project_id' => $project_id,
503 'value' => file_create_url(file_load($genotype[
'files'][
'other'])->uri),
504 'rank' => $file_rank,
508 $groups = $genotype[
'files'][
'other-groups'];
510 $options[
'type'] =
'other';
512 $options[
'marker'] = $genotype[
'other-marker'];
513 $options[
'type_cvterm'] = chado_get_cvterm(array(
515 'name' =>
'sequence',
517 'name' =>
'genetic_marker',
520 $options[
'tree_id'] = $groups[
'Tree Id'][1];
522 tpps_file_iterator($genotype[
'files'][
'other'],
'tpps_process_genotype_spreadsheet', $options);
525 unset($options[
'records']);
529 if (!empty($genotype[
'files'][
'file-type'][
'VCF'])) {
533 'project_id' => $project_id,
541 'value' => file_create_url(file_load($genotype[
'files'][
'vcf'])->uri),
542 'rank' => $file_rank,
548 $records[
'genotypeprop'] = array();
550 $snp_cvterm = chado_get_cvterm(array(
552 'name' =>
'sequence',
557 $format_cvterm = chado_get_cvterm(array(
564 $qual_cvterm = chado_get_cvterm(array(
566 'name' =>
'sequence',
568 'name' =>
'quality_value',
571 $filter_cvterm = chado_get_cvterm(array(
573 'name' =>
'operation',
575 'name' =>
'Sequence contamination filtering',
578 $freq_cvterm = chado_get_cvterm(array(
580 'name' =>
'sequence',
582 'name' =>
'allelic_frequency',
585 $depth_cvterm = chado_get_cvterm(array(
589 'name' =>
'Read Depth',
592 $n_sample_cvterm = chado_get_cvterm(array(
593 'name' =>
'number_samples',
597 $vcf_file = file_load($genotype[
'files'][
'vcf']);
598 $location = drupal_realpath($vcf_file->uri);
599 $vcf_content = fopen($location,
'r');
602 $current_id = $form_state[
'ids'][
'organism_ids'][$i];
603 $species_code = $species_codes[$current_id];
606 while (($vcf_line = fgets($vcf_content)) !== FALSE) {
607 if ($vcf_line[0] !=
'#') {
609 $vcf_line = explode(
"\t", $vcf_line);
610 $scaffold_id = &$vcf_line[0];
611 $position = &$vcf_line[1];
612 $marker_name = &$vcf_line[2];
613 $ref = &$vcf_line[3];
614 $alt = &$vcf_line[4];
615 $qual = &$vcf_line[5];
616 $filter = &$vcf_line[6];
617 $info = &$vcf_line[7];
619 if (empty($variant_name) or $variant_name ==
'.') {
620 $variant_name =
"{$scaffold_id}{$position}$ref:$alt";
622 $marker_name = $variant_name . $marker;
623 $description =
"$ref:$alt";
624 $genotype_name =
"$marker-$species_code-$scaffold_id-$position";
625 $genotype_desc =
"$marker-$species_code-$scaffold_id-$position-$description";
627 $records[
'feature'][$marker_name] = array(
628 'organism_id' => $current_id,
629 'uniquename' => $marker_name,
630 'type_id' => $seq_var_cvterm,
633 $records[
'feature'][$variant_name] = array(
634 'organism_id' => $current_id,
635 'uniquename' => $variant_name,
636 'type_id' => $seq_var_cvterm,
639 $records[
'genotype'][$genotype_desc] = array(
640 'name' => $genotype_name,
641 'uniquename' => $genotype_desc,
642 'description' => $description,
643 'type_id' => $snp_cvterm,
647 $records[
'genotypeprop'][
"$genotype_desc-format"] = array(
648 'type_id' => $format_cvterm,
651 'genotype' => $genotype_desc,
656 for ($j = 9; $j < count($vcf_line); $j++) {
657 $records[
'genotype_call'][
"{$stocks[$j - 9]}-$genotype_name"] = array(
658 'project_id' => $project_id,
659 'stock_id' => $stocks[$j - 9],
661 'genotype' => $genotype_desc,
662 'variant' => $variant_name,
663 'marker' => $marker_name,
667 $records[
'stock_genotype'][
"{$stocks[$j - 9]}-$genotype_name"] = array(
668 'stock_id' => $stocks[$j - 9],
670 'genotype' => $genotype_desc,
676 $records[
'genotypeprop'][
"$genotype_desc-qual"] = array(
677 'type_id' => $qual_cvterm,
680 'genotype' => $genotype_desc,
685 $records[
'genotypeprop'][
"$genotype_desc-filter"] = array(
686 'type_id' => $filter_cvterm,
687 'value' => ($filter ==
'.') ?
"P" :
"NP",
689 'genotype' => $genotype_desc,
694 $info_vals = explode(
";", $info);
695 foreach ($info_vals as $key => $val) {
696 $parts = explode(
"=", $val);
697 unset($info_vals[$key]);
698 $info_vals[$parts[0]] = isset($parts[1]) ? $parts[1] :
'';
703 if (isset($info_vals[
'AF']) and $info_vals[
'AF'] !=
'') {
704 $records[
'genotypeprop'][
"$genotype_desc-freq"] = array(
705 'type_id' => $freq_cvterm,
706 'value' => $info_vals[
'AF'],
708 'genotype' => $genotype_desc,
715 if (isset($info_vals[
'DP']) and $info_vals[
'DP'] !=
'') {
716 $records[
'genotypeprop'][
"$genotype_desc-depth"] = array(
717 'type_id' => $depth_cvterm,
718 'value' => $info_vals[
'DP'],
720 'genotype' => $genotype_desc,
727 if (isset($info_vals[
'NS']) and $info_vals[
'NS'] !=
'') {
728 $records[
'genotypeprop'][
"$genotype_desc-n_sample"] = array(
729 'type_id' => $n_sample_cvterm,
730 'value' => $info_vals[
'NS'],
732 'genotype' => $genotype_desc,
739 if ($genotype_count > $record_group) {
743 'feature' => array(),
744 'genotype' => array(),
745 'genotype_call' => array(),
746 'genotypeprop' => array(),
747 'stock_genotype' => array(),
752 elseif (preg_match(
'/##FORMAT=/', $vcf_line)) {
753 $format .= substr($vcf_line, 9, -1);
755 elseif (preg_match(
'/#CHROM/', $vcf_line)) {
756 $vcf_line = explode(
"\t", $vcf_line);
757 for ($j = 9; $j < count($vcf_line); $j++) {
758 $stocks[] = $form_state[
'tree_info'][trim($vcf_line[$j])][
'stock_id'];
770 if (isset($fourthpage[
"organism-$i"][
'environment'])) {
771 $environment = $fourthpage[
"organism-$i"][
'environment'];
772 $env_layers_check = isset($environment[
'use_layers']) ? $environment[
'use_layers'] : FALSE;
773 $env_layers = isset($environment[
'env_layers']) ? $environment[
'env_layers'] : FALSE;
774 $env_params = isset($environment[
'env_params']) ? $environment[
'env_params'] : FALSE;
775 $env_number = $environment[
'env_manual'][
'number'];
778 $species_index =
"species-$i";
779 if (empty($form_state[
'saved_values'][
TPPS_PAGE_3][
'tree-accession'][
'check'])) {
780 $species_index =
"species-1";
782 $tree_accession = $form_state[
'saved_values'][
TPPS_PAGE_3][
'tree-accession'][$species_index];
783 $id_col = $tree_accession[
'file-groups'][
'Tree Id'][1];
785 $env_cvterm = chado_get_cvterm(array(
787 'name' =>
'biomaterial_property',
789 'name' =>
'climate_environment',
793 if ($env_layers_check and db_table_exists(
'cartogratree_layers') and db_table_exists(
'cartogratree_fields')) {
794 $layers_params = array();
796 'phenotype' => array(),
797 'phenotype_cvterm' => array(),
798 'stock_phenotype' => array(),
801 foreach ($env_layers as $layer_name => $layer_id) {
802 if (!empty($layer_id) and !empty($env_params[$layer_name])) {
803 $layers_params[$layer_id] = array();
804 $params = $env_params[$layer_name];
805 foreach ($params as $param_name => $param_id) {
806 if (!empty($param_id)) {
807 $layers_params[$layer_id][$param_id] = $param_name;
811 elseif (!empty($layer_id) and preg_match(
'/worldclim_subgroup_(.+)/', $layer_id, $matches)) {
812 $subgroup_id = $matches[1];
813 $layers = db_select(
'cartogratree_layers',
'l')
814 ->fields(
'l', array(
'layer_id'))
815 ->condition(
'subgroup_id', $subgroup_id)
817 while (($layer = $layers->fetchObject())) {
818 $params = db_select(
'cartogratree_fields',
'f')
819 ->fields(
'f', array(
'field_id',
'display_name'))
820 ->condition(
'layer_id', $layer->layer_id)
822 while (($param = $params->fetchObject())) {
823 $layers_params[$layer->layer_id][$param->field_id] = $param->display_name;
830 'no_header' => !empty($tree_accession[
'file-no-header']),
831 'records' => $records,
832 'tree_id' => $tree_accession[
'file-groups'][
'Tree Id'][1],
833 'accession' => $form_state[
'accession'],
834 'tree_info' => $form_state[
'tree_info'],
835 'layers_params' => $layers_params,
836 'env_count' => &$env_count,
837 'env_cvterm' => $env_cvterm,
841 tpps_file_iterator($tree_accession[
'file'],
'tpps_process_environment_layers', $options);
844 unset($options[
'records']);
850 for ($j = 1; $j <= $env_number; $j++) {
851 $current_env = $environment[
'env_manual'][$j];
853 'name' => $current_env[
'name'],
854 'desc' => $current_env[
'description'],
855 'unit' => $current_env[
'units'],
856 'val' => $current_env[
'value'],
861 'phenotype' => array(),
862 'stock_phenotype' => array(),
863 'phenotypeprop' => array(),
867 'no_header' => !empty($tree_accession[
'file-no-header']),
868 'accession' => $form_state[
'accession'],
869 'records' => $records,
870 'env_meta' => $env_meta,
871 'env_count' => $env_count,
873 'tree_info' => $form_state[
'tree_info'],
874 'tree_id' => $tree_accession[
'file-groups'][
'Tree Id'][1],
875 'env_cvterm' => $env_cvterm,
876 'desc_id' => chado_get_cvterm(array(
877 'name' =>
'description',
883 'unit_id' => chado_get_cvterm(array(
892 tpps_file_iterator($tree_accession[
'file'],
'tpps_process_environment_manual', $options);
895 unset($options[
'records']);
tpps_ssrs_headers($fid, $ploidy)
tpps_update_submission(array $state, array $options=array())
tpps_chado_insert_record($table, $records, array $options=array())
tpps_chado_insert_multi(array $record_groups, array $options=array())
tpps_file_headers($fid, $no_header=FALSE)
tpps_load_submission($accession, $state=TRUE)
tpps_other_marker_headers($fid, array $cols)
tpps_file_iterator($fid, $function, array &$options=array())
tpps_refine_phenotype_meta(array &$meta)