#! /bin/sh
#!perl -w # --*- Perl -*--
eval 'exec perl -x $0 ${1+"$@"}'
    if 0;
#------------------------------------------------------------------------------
#$Author: antanas $
#$Date: 2023-12-20 18:38:39 +0200 (Wed, 20 Dec 2023) $
#$Revision: 9867 $
#$URL: svn+ssh://www.crystallography.net/home/coder/svn-repositories/cod-tools/tags/v3.11.0/scripts/cif_mark_disorder $
#------------------------------------------------------------------------------
#*
#* Mark disorder in CIF files judging by distance and occupancy.
#*
#* USAGE:
#*    $0 --options input1.cif input*.cif
#**

use strict;
use warnings;
use COD::AtomProperties;
use COD::CIF::Data::MarkDisorder qw( mark_disorder );
use COD::CIF::Parser qw( parse_cif );
use COD::CIF::Tags::CanonicalNames qw( canonicalize_all_names );
use COD::CIF::Tags::Print qw( print_cif );
use COD::ErrorHandler qw( process_warnings
                          process_errors
                          process_parser_messages );
use COD::SOptions qw( getOptions );
use COD::SUsage qw( usage options );
use COD::ToolsVersion qw( get_version_string );

my $Id = '$Id: cif_mark_disorder 9867 2023-12-20 16:38:39Z antanas $';
$Id =~ s/^\$|\$$//g;

my $same_site_distance_sensitivity = 0.000001;
my $same_site_occupancy_sensitivity = 0.01;
my $brick_size = 1;
my $cif_header_file;
my $use_parser = 'c';
my $exclude_zero_occupancies = 1;
my $reconstruct_symmetry = 0;
my $report_marked_disorders = 1;
my $ignore_occupancies = 0;
my $output_dot_assemblies = 0;
my $uniquify_atoms = 0;
my $messages_to_depositor_comments = 1;

my $die_on_errors   = 1;
my $die_on_warnings = 0;
my $die_on_notes    = 0;

#* OPTIONS:
#*   -d, --distance-sensitivity  0.000001
#*                     Specify maximum distance between two atoms that should
#*                     be perceived as belonging to the same atom site.
#*                     Default: 0.000001.
#*
#*   --occupancy-sensitivity  0.01
#*                     Set maximum deviation for the sum of occupancies of
#*                     the atoms from the same atom site from 1.
#*                     Default: 0.01.
#*
#*   -h, --add-cif-header
#*                     Comments from the beginning of this file will be
#*                     prepended to the output.
#*
#*   --exclude-zero-occupancies
#*                     Ignore atoms with an occupancy of 0 in the calculations.
#*                     Default.
#*   --no-exclude-zero-occupancies,
#*   --dont-exclude-zero-occupancies
#*                     Use atoms with an occupancy of 0 in the calculations.
#*
#*   --ignore-occupancies
#*                     Do not require occupancies of atoms in the same
#*                     atom site to sum up to 1.
#*   --no-ignore-occupancies,
#*   --dont-ignore-occupancies
#*                     Require occupancies of atoms in the same atom site
#*                     to sum up to 1 to be recognised as a disordered site.
#*                     Default.
#*
#*   --reconstruct-symmetry
#*                     Reconstruct crystal symmetry before searching for
#*                     possible unmarked disorder.
#*   --no-reconstruct-symmetry,
#*   --dont-reconstruct-symmetry
#*                     Do not reconstruct symmetry, act on asymmetric unit
#*                     cell only. Default.
#*
#*   --output-dot-assemblies
#*                     Use dot ('.') assembly names. With this option on,
#*                     already existing dot assemblies in the input CIF
#*                     will be preserved. If no assemblies have existed in
#*                     the input file, but the script found a single new
#*                     assembly, it will be named '.'.
#*   --no-output-dot-assemblies,
#*   --dont-output-dot-assemblies
#*                     Rename dot ('.') assemblies in the input and do not
#*                     name new assemblies '.'. Default.
#*
#*   --report-marked-disorders
#*                     Print each of the marked disorder assemblies to
#*                     stderr, listing atom labels. Default.
#*   --no-report-marked-disorders,
#*   --dont-report-marked-disorders
#*                     Do not print marked disorder assemblies to stderr.
#*
#*   --add-depositor-comments
#*                     Append reports about newly marked disorder assemblies
#*                     together with the signature of this script to the
#*                     '_cod_depositor_comments' value. Default.
#*   --no-add-depositor-comments,
#*   --dont-add-depositor-comments
#*                     Do not append anything to the value of
#*                     '_cod_depositor_comments'.
#*
#*   --brick-size  1
#*                     Brick size parameter for the 'AtomBricks' algorithm.
#*
#*   --uniquify-atoms
#*                     Makes unique the labels of atoms.
#*   --no-uniquify-atoms, --dont-uniquify-atoms
#*                     Do not makes unique labels for atoms,
#*                     exclude duplicates. Default.
#*
#*   --continue-on-errors
#*                     Do not terminate script if errors are raised.
#*   --die-on-errors
#*                     Terminate script immediately if errors are raised.
#*                     Default.
#*   --continue-on-warnings
#*                     Do not terminate script if warnings are raised. Default.
#*   --die-on-warnings
#*                     Terminate script immediately if warnings are raised.
#*   --continue-on-notes
#*                     Do not terminate script if notes are raised. Default.
#*   --die-on-notes
#*                     Terminate script immediately if notes are raised.
#*
#*   --use-perl-parser
#*                     Use Perl parser for CIF parsing.
#*   --use-c-parser
#*                     Use Perl & C parser for CIF parsing. Default.
#*
#*   --help, --usage
#*                     Output a short usage message (this message) and exit.
#*   --version
#*                     Output version information and exit.
#**
@ARGV = getOptions(
    "-d,--distance-sensitivity" => \$same_site_distance_sensitivity,
    "--occupancy-sensitivity" => \$same_site_occupancy_sensitivity,

    "-h,--add-cif-header" => \$cif_header_file,

    "--exclude-zero-occupancies"      => sub { $exclude_zero_occupancies = 1 },
    "--no-exclude-zero-occupancies"   => sub { $exclude_zero_occupancies = 0 },
    "--dont-exclude-zero-occupancies" => sub { $exclude_zero_occupancies = 0 },

    "--ignore-occupancies"      => sub { $ignore_occupancies = 1 },
    "--no-ignore-occupancies"   => sub { $ignore_occupancies = 0 },
    "--dont-ignore-occupancies" => sub { $ignore_occupancies = 0 },

    '--reconstruct-symmetry'      => sub { $reconstruct_symmetry = 1 },
    '--no-reconstruct-symmetry'   => sub { $reconstruct_symmetry = 0 },
    '--dont-reconstruct-symmetry' => sub { $reconstruct_symmetry = 0 },

    "--output-dot-assemblies"      => sub { $output_dot_assemblies = 1 },
    "--no-output-dot-assemblies"   => sub { $output_dot_assemblies = 0 },
    "--dont-output-dot-assemblies" => sub { $output_dot_assemblies = 0 },

    "--report-marked-disorders" => sub { $report_marked_disorders = 1 },
    "--no-report-marked-disorders" =>
        sub { $report_marked_disorders = 0 },
    "--dont-report-marked-disorders" =>
        sub { $report_marked_disorders = 0 },

    "--add-depositor-comments" =>
        sub { $messages_to_depositor_comments = 1 },
    "--no-add-depositor-comments" =>
        sub { $messages_to_depositor_comments = 0 },
    "--dont-add-depositor-comments" =>
        sub { $messages_to_depositor_comments = 0 },

    "--brick-size" => \$brick_size,

    '--uniquify-atoms'      => sub { $uniquify_atoms = 1 },
    '--no-uniquify-atoms'   => sub { $uniquify_atoms = 0 },
    '--dont-uniquify-atoms' => sub { $uniquify_atoms = 0 },

    '--continue-on-errors'          => sub { $die_on_errors = 0 },
    '--die-on-errors'               => sub { $die_on_errors = 1 },

    '--continue-on-warnings' => sub { $die_on_warnings = 0 },
    '--die-on-warnings'      => sub { $die_on_warnings = 1 },

    '--continue-on-notes'    => sub { $die_on_notes = 0 },
    '--die-on-notes'         => sub { $die_on_notes = 1 },

    "--use-perl-parser" => sub { $use_parser = "perl" },
    "--use-c-parser"    => sub { $use_parser = "c" },

    '--options'      => sub { options; exit },
    '--help,--usage' => sub { usage; exit },
    '--version'      => sub { print get_version_string(), "\n"; exit }
);

my $die_on_error_level = {
    ERROR   => $die_on_errors,
    WARNING => $die_on_warnings,
    NOTE    => $die_on_notes
};

my $cif_header;
eval {
    if( $cif_header_file ) {
        open( my $header, '<', $cif_header_file ) or die 'ERROR, '
            . 'could not open CIF header file for reading -- '
            . lcfirst($!) . "\n";

        $cif_header = '';
        while( <$header> ) {
            last unless /^#/;
            $cif_header .= $_;
        }

        close( $header ) or die 'ERROR, '
            . 'error while closing CIF header file after reading -- '
            . lcfirst($!) . "\n";

        # The header must not contain CIF 2.0 magic code. For CIF 2.0
        # files the magic code is printed explicitly before the header.
        $cif_header =~ s/^#\\#CIF_2\.0[ \t]*\n//;
    }
};
if ($@) {
    process_errors( {
      'message'  => $@,
      'program'  => $0,
      'filename' => $cif_header_file
    }, $die_on_errors );
};

@ARGV = ("-") unless @ARGV;

binmode STDOUT, ':encoding(UTF-8)';
binmode STDERR, ':encoding(UTF-8)';

for my $filename (@ARGV) {
    my $options = { 'parser' => $use_parser, 'no_print' => 1 };
    my ( $data, $err_count, $messages ) = parse_cif( $filename, $options );
    process_parser_messages( $messages, $die_on_error_level );

    next if $err_count > 0;

    canonicalize_all_names( $data );

    if( $cif_header ) {
        # Ensure that for CIF v2.0 the magic code comes
        # before the CIF comment header:
        if( grep { exists $_->{cifversion} &&
                          $_->{cifversion}{major} == 2 } @$data ) {
            print "#\\#CIF_2.0\n";
        }
        print $cif_header;
    }

    for my $dataset (@$data) {

        my $dataname = 'data_' . $dataset->{'name'};

        local $SIG{__WARN__} = sub {
            process_warnings( {
                'message'  => @_,
                'program'  => $0,
                'filename' => $filename,
                'add_pos'  => $dataname
            }, $die_on_error_level )
        };

        eval {
            mark_disorder( $dataset,
                           \%COD::AtomProperties::atoms,
                           { same_site_distance_sensitivity =>
                                $same_site_distance_sensitivity,
                             same_site_occupancy_sensitivity =>
                                $same_site_occupancy_sensitivity,
                             brick_size => $brick_size,
                             exclude_zero_occupancies =>
                                $exclude_zero_occupancies,
                             reconstruct_symmetry =>
                                $reconstruct_symmetry,
                             report_marked_disorders =>
                                $report_marked_disorders,
                             ignore_occupancies => $ignore_occupancies,
                             messages_to_depositor_comments =>
                                $messages_to_depositor_comments,
                             no_dot_assembly => !$output_dot_assemblies,
                             uniquify_atoms => $uniquify_atoms,
                             depositor_comments_signature => $Id } );
            print_cif( $dataset,
                       {
                            preserve_loop_order => 1,
                            keep_tag_order => 1
                       }
                     );
        };
        if ( $@ ) {
            process_errors( {
              'message'       => $@,
              'program'       => $0,
              'filename'      => $filename,
              'add_pos'       => $dataname
            }, $die_on_errors )
        }
    }
}
