#!/bin/bash

# findup - find duplicate files
# Copyright © 2000-2009 by Pádraig Brady <P@draigBrady.com>.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details,
# which is available at www.gnu.org


# Description
#
#   will show duplicate files in the specified directories
#   (and their subdirectories), in the format:
#
#       file1
#       file2
#
#       file3
#       file4
#       file5
#
#   or if the --summary option is specified:
#
#       2 * 2048	file1 file2
#       3 * 1024	file3 file4 file5
#
#   Where the number is the disk usage in bytes of each of the
#   duplicate files on that line, and all duplicate files are
#   shown on the same line.
#       Output is ordered by largest disk usage first and
#   then by the number of duplicate files.
#
# Caveats/Notes:
#   I compared this to any equivalent utils I could find (as of Nov 2000)
#   and it's (by far) the fastest, has the most functionality (thanks to
#   find) and has no (known) bugs. In my opinion fdupes is the next best but
#   is slower (even though written in C), and has a bug where hard links
#   in different directories are reported as duplicates sometimes.
#
#   This script requires uniq > V2.0.21 (part of GNU textutils|coreutils)
#   dir/file names containing \n are ignored
#   undefined operation for dir/file names containing \1
#   sparse files are not treated differently.
#   Don't specify params to find that affect output etc. (e.g -printf etc.)
#   zero length files are ignored.
#   symbolic links are ignored.
#   path1 & path2 can be files &/or directories

script_dir=$(dirname "$0")              #directory of this script
script_dir=$(readlink -f "$script_dir") #Make sure absolute path

. "$script_dir"/supprt/fslver

Usage() {
    ProgName=$(basename "$0")
    echo "find dUPlicate files.
Usage: $ProgName [[[-t [-m|-d]] | [--summary]] [-r] [-f] paths(s) ...]

If no path(s) specified then the current directory is assumed.


When -m is specified any found duplicates will be merged (using hardlinks).
When -d is specified any found duplicates will be deleted (leaving just 1).
When -t is specfied, only report what -m or -d would do.

When --summary is specified change output format to include file sizes.
You can also pipe this summary format to "$script_dir"/fstool/dupwaste
to get a total of the wastage due to duplicates.

Examples:

search for duplicates in current directory and below
    findup or findup .
search for duplicates in all linux source directories and merge using hardlinks
    findup -m /usr/src/linux*
same as above but don't look in subdirectories
    findup -r .
search for duplicates in /usr/bin
    findup /usr/bin
search in multiple directories but not their subdirectories
    findup -r /usr/bin /bin /usr/sbin /sbin
search for duplicates in \$PATH
    findup \$("$script_dir"/supprt/getffp)
search system for duplicate files over 100K in size
    findup / -size +100k
search only my files (that I own and are in my home dir)
    findup ~ -user \$(id -u)
search system for duplicate files belonging to roger
    findup / -user \$(id -u roger)"
    exit
}

cleanup_sum() {

    sed '
    # md5sum and sha1sum et. al. from coreutils at least,
    # to deal with \n in filenames, convert any \ and \n chars
    # to \\ and \\n respectively. Currently we ignore files with \n
    # so just undo this problematic escaping
    /^\\/{s/.//; s/\\\\/\\/g};

    # These utils also add a "*" flag character for normal files
    # on platforms where O_BINARY is significant (like CYGWIN).
    # We always process in binary mode and so remove that flag here
    s/^\([^ ]*\) \*/\1  /;
    '
}

for arg
do
    case "$arg" in
    -h|--help|-help)
        Usage ;;
    -v|--version)
        Version ;;
    --summary)
        mode="summary" ;;
    -m)
        mode="merge" ;;
    -d)
        mode="del" ;;
    -t)
        t="t" ;;
    *)
        argsToPassOn="$argsToPassOn $(shell_quote "$arg")" ;;
    esac
done

sep_mode="separate"

if [ "$mode" = "summary" ]; then
    #Don't do extra hardlink processing.
    #This speeds things up, and also removes the python dependency
    merge_early="-u"
fi

. "$script_dir"/supprt/getfpf "$argsToPassOn"

check_uniq

dev_id="$(find /bin/sh -printf '%D' 2>/dev/null)"
if [ "$dev_id" = "D" ] || [ ! "$dev_id" ]; then
    devFmt="\060" #0
else
    devFmt=%D #This is new in findutils-4.2 and will help find more duplicates
fi

                                              #print name, dev, inode & size.
find "$@" -size +0c -type f ! -name "*$LF*" -printf "$FPF\0$devFmt\0%i\0%s\n" |
sort -u |            #merge files (indirectly) specified multiple times
tr ' \t\0' '\0\1 ' | #remove spaces, tabs in file names
sort -k4,4nr -k2,2n -k3,3 $merge_early |#group [and merge] size,dev & inodes
if [ -z "$merge_early" ]; then
    "$script_dir"/supprt/rmlint/merge_hardlinks
else
    uniq -3 -D       #pick just duplicate filesizes
fi |
sort -k2,2n -k3,3n | #NB sort inodes so md5sum does less seeking all over disk
cut -f1 -d' ' -s |   #get filenames to work on
tr '\0\1\n' ' \t\0' |#reset any space & tabs etc and delimit names with \0

# The following optional block, md5sums a small sample of each file,
# which can help when there are many files of the same size,
# even more so if they are large. This usually adds a small amount of
# runtime, however it can save a large amount of time in certain situations.
if "$script_dir"/supprt/md5sum_approx </dev/null 2>/dev/null; then
    xargs -r0 "$script_dir"/supprt/md5sum_approx |
    sort |                     #group duplicate files together
    uniq --all-repeated -w32 | #pick just duplicates
    cut -d' ' -f3- |           #get filenames
    sort |                     #sort by paths to try to minimise disk seeks
    tr '\n' '\0'               #delimit names with \0
else
    cat
fi |

# This block selects duplicates using md5sum of whole file
xargs -r0 md5sum -- |      #calculate md5sums for possible duplicates
cleanup_sum |              #undo any backslash escaping
sort |                     #group duplicate files together
uniq --all-repeated=$sep_mode -w32 | #pick just duplicates

# The following optional block, checks duplicates again using sha1
# Note for data sets that don't totally fit in cache this will
# probably read duplicate files off the disk again.
cut -s -d' ' -f3- |        #get filenames
sort |                     #sort by paths to try to minimise disk seeks
tr '\n' '\0' |             #delimit names with \0
xargs -r0 sha1sum -- |     #to be sure to be sure
cleanup_sum |              #undo any backslash escaping
sort |                     #group duplicate files together
uniq --all-repeated=$sep_mode -w40 | #pick just duplicates

cut -d' ' -f3- |           #get filenames (and leave separating lines)

if [ "$mode" ]; then
  if [ ! $mode = "summary" ]; then # external call to python as this is faster
    if "$script_dir"/supprt/rmlint/fixdup </dev/null 2>/dev/null; then
        "$script_dir"/supprt/rmlint/fixdup $t$mode
    elif "$script_dir"/supprt/rmlint/fixdup.sh </dev/null 2>/dev/null; then
        "$script_dir"/supprt/rmlint/fixdup.sh $t$mode
    else
        echo "Error, couldn't execute merge util" >&2
        exit 1
    fi
  else
    (
    line=''
    declare -i counter #Use bash arithmetic, not expr (for speed)
    counter=0
    while read; do
        # note we don't specify "file" to `read`
        # as otherwise trailing IFS will be stripped
        file="$REPLY"
        if [ ! "$file" ]; then
            if [ ! -z "$line" ]; then
                echo "$counter * $line"
            fi
            counter=0
        else
            if [ $counter -eq 0 ]; then
                line=$(du -B1 "$file")
            else
                line="$line $file"
            fi
            counter=counter+1
        fi
    done

    if [ ! -z "$line" ]; then
        echo "$counter * $line"
    fi
    ) |
    sort -k3,3 -k1,1 -brn
  fi
else
  cat
fi
