#!/usr/bin/python
#
# imgsizer -- correct image sizes in WWW pages
# by Eric S. Raymond <esr@thyrsus.com>
#
# Fix up IMG tags in given documents to contain correct sizes.
#
# This code runs under either Python 2 or Python 3. Keep it that way!
#
# SPDX-License-Identifier: BSD-2-clause
#
from __future__ import print_function

# Changelog:
#
# Originally created by Eric S. Raymond <esr@thyrsus.com> 30 Jul 1996
#
# Modified by Erik Rossen <rossen@planet.ch> 15 May 1999
#
#    Added the --nomagick switch, to use file(1) and rdjpgcom(1)
#    to determine the image size instead of identify(1) from the
#    ImageMagick suite.
#
# Modified by Michael C. Toren <michael@toren.net> 18 Aug 2000
#
#    Fixed bug where the SRC attribute's value needed to be in quotes,
#    improved command line parsing (but it could still use some work),
#    added -q switch to omit quotes when generating tags, and -l switch
#    to generate lowercase tags.  -mct
#
# Modified by Michael C. Toren <michael@toren.net> 19 Aug 2000
#
#    Improved the command line parsing some more, now looks for additional
#    arguments via an IMGSIZER environmental variable, added the -d switch
#    to set the DocumentRoot, -v switch to display version information,
#    and -h switch to display usage information.  -mct
#
# Modified by Michael C. Toren <michael@toren.net> 23 Feb 2001
#
#    Fixed two bugs reported by Jeroen Valcke <jeroen@valcke.com>, one
#    where the -d switch did not function properly if the img src attribute
#    was quoted, and another where the &error sub was incorrectly reporting
#    the line number an error occurred due to the input record separator
#    being set to ">".
#
# Rewritten in Python by Eric S. Raymond <esr@thyrsus.com> 11 July 2001
#
#    Time to get rid of the dependency on httpget.  The -l option is gone, too;
#    instead, we deduce the right case by looking at the leading tag.  -q
#    is gone; we always emit without quotes.  -m is gone too, instead we
#    try commands in least to most expensive order, and notice when a command
#    is not found so that we don't try it again.
#
# Fixes by ESR, 29 July 2001
#
#    Incorporated fixes by Peter S. Galbraith.
#
# Fixes by ESR, 25 April 2003
#
#    Merged amended versions of Lennart Poettering's fix for Debian bug 139714
#    and Jeroen N. Witmond's fix for Debian bug 168964.  Added regression-test
#    production.
#
# Enhancement by ESR, 14 Nov 2003
#
#    Verify and merge Lucien Saviot's patch to produce XHTML from XHTML input.
#    Also his change to handle spurious line breaks produced by Dave Raggett's
#    tidy(1) utility.
#
# Modified by Andrew Gwozdziewycz <gwozdzie@lucas.cis.temple.edu>, 17 June 2004
#
#    Added support for the Python Imaging Library to determine size in case of
#    failure from file(1), rdjpgcom(1) and identify(1).
#
# SPDX-License-Identifier: BSD-2-Clause

import sys, os, getopt, string, re, filecmp

# Warning: In some Python 3 versions getstatusoutput() returns
# status incorrectly so that a nonzero exit looks like the subprocess
# was signaled!  (Observed under 3.4.3; Debian bug #764848)
try:
    from subprocess import getstatusoutput
except ImportError:
    from commands import getstatusoutput

try:
    from urllib.request import urlretrieve
except ImportError:
    from urllib import urlretrieve

# Program version, interpolated into the splash text below.
version = "2.10"

# Banner template; it contains a %s placeholder that must be filled in
# with `version` before it is printed.
splash = """imgsizer version %s, Eric S. Raymond <esr@thyrsus.com>
See <http://www.catb.org/~esr/software.html> for updates."""

# Help text.  The long option names listed here must match the ones handed
# to getopt in the option-processing code: the document-root switch is
# registered there as 'document', and 'usage' is accepted as an alias for
# 'help'.
usage = """Usage: imgsizer [OPTIONS] [HTML File]

Options:

    -V, --version

        Display version information and exit.

    -h, --help, --usage

        Display usage information.

    -d <directory>, --document <directory>

        Directory where absolute image filenames (i.e, ones which contain
        a leading "/") may be found.

    -n, --no-overwrite

        Don't overwrite existing width and height tags if both are present.

"""

# Optimization latches.  Each flag starts out on; when an attempt to use
# the corresponding tool reports "not found" (exit status 127) the flag
# is turned off and that tool is not tried again.
#
#   magick      -- identify(1) from ImageMagick
#   rdjpgcom    -- rdjpgcom(1), used for JPEGs
#   pythonimage -- the Python Imaging Library
magick = rdjpgcom = pythonimage = 1

def attrformat(xc, dim):
    """Format one attribute as ' name="value"' with a leading space.

    xc is the value (converted with str()); dim is the attribute name.
    The name is emitted as given when the module-level `lower` flag is
    true, and uppercased otherwise.
    """
    name = dim if lower else dim.upper()
    return "".join([" ", name, '="', str(xc), '"'])

def sizefix(infp, outfp):
    """Copy infp to outfp, rewriting the attributes of each IMG tag.

    Everything is copied through unchanged except the attribute text of
    tags that begin with exactly "<img" or "<IMG"; that text is replaced
    by the result of transform().  The module-level `lower` flag is set
    from the case of the tag name so that generated attributes match it.

    infp and outfp are text-mode file-like objects; only read(1)/read(2)
    and write() are used, so the input may be a non-seekable stream.
    """
    global lower
    while True:
        ch = infp.read(1)
        if ch == '':
            return
        outfp.write(ch)
        if ch != '<':
            continue
        # Possibly within an HTML tag.  Splitting the read into 2 + 1
        # characters copes with single-char tags like <b>.
        lead = infp.read(2)
        outfp.write(lead)
        if lead not in ("im", "IM"):
            continue
        # Write exactly what was read: at end of input this is the empty
        # string, so nothing already emitted gets written a second time.
        third = infp.read(1)
        outfp.write(third)
        lead = lead + third
        if lead not in ("img", "IMG"):
            continue
        # Within an image tag; remember its case for attrformat().
        lower = (lead == 'img')
        pieces = []
        while True:
            ch = infp.read(1)
            if ch == '':
                # Input ended inside the tag.  Pass the partial attribute
                # text through untouched rather than silently dropping it.
                outfp.write("".join(pieces))
                return
            if ch == '>':
                break
            if ch == '/':
                # "/>" closes an XHTML-style tag; any other character
                # after the slash is ordinary attribute text.
                ch2 = infp.read(1)
                ch = ch + ch2
                if ch2 == '>':
                    break
            pieces.append(ch)
        # ch now holds the terminator, either ">" or "/>".
        outfp.write(transform("".join(pieces)) + ch)

# Matches a " <width> x <height>" pair; applied to the output of file(1)
# and identify(1).
x_match = re.compile(r" ([0-9]+) *x *([0-9]+)")
# Matches a " <width>w * <height>h" pair; applied to the output of
# "rdjpgcom -verbose".
rdjpg_match = re.compile(r" ([0-9]+)w *\* *([0-9]+)h")

def _local_imgsize(filename):
    """Return (width, height) for the local file `filename`, or None.

    Tools are tried from cheapest to most expensive: file(1), rdjpgcom(1)
    for JPEGs, identify(1), and finally the Python Imaging Library.  The
    module-level latches magick, rdjpgcom and pythonimage are cleared when
    the corresponding tool turns out not to be installed.
    """
    global magick, rdjpgcom, pythonimage
    # shlex.quote is the Python 3 spelling; Python 2 only has pipes.quote.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote
    # The shell reports "command not found" as exit status 127.
    # commands.getstatusoutput() (Python 2) and subprocess.getstatusoutput()
    # before Python 3.3.4 return a wait()-style status instead, in which
    # the exit code sits in the high byte.
    not_found = (127, 127 << 8)
    # The name may derive from an untrusted SRC attribute, so never paste
    # it into a shell command line unquoted.
    target = quote(filename)

    # Try file(1) first -- cheapest, as it doesn't read the whole image
    (status, output) = getstatusoutput("file " + target)
    if status == 0:
        # file(1) works for every common image format other than JPEG
        if "JPEG" not in output:
            sizes = x_match.search(output)
            if sizes:
                return (sizes.group(1), sizes.group(2))
        elif rdjpgcom:
            # Use rdjpgcom(1) to handle JPEGs
            (status, output) = getstatusoutput("rdjpgcom -verbose " + target)
            sizes = rdjpg_match.search(output)
            if sizes:
                return (sizes.group(1), sizes.group(2))
            elif status in not_found:
                rdjpgcom = 0
    # Next try identify(1), more expensive but bulletproof
    if magick:
        (status, output) = getstatusoutput("identify " + target)
        if status == 0:
            sizes = x_match.search(output)
            if sizes:
                return (sizes.group(1), sizes.group(2))
        elif status in not_found:
            sys.stderr.write("imgsizer: giving up on ImageMagick\n")
            magick = 0
    # If that fails, try at _LAST_ resort the Python Imaging Library.
    # open doesn't actually load all the data, so it shouldn't be too expensive
    if pythonimage:
        try:
            # Pillow and late PIL releases live in the PIL package; very
            # old PIL installs expose a top-level Image module instead.
            try:
                from PIL import Image
            except ImportError:
                import Image
        except ImportError:
            # Only a missing library turns the latch off.
            sys.stderr.write("imgsizer: giving up on Python Imaging Library\n")
            pythonimage = 0
        else:
            try:
                return Image.open(filename).size
            except IOError:
                # This one file is unreadable; later images may still work,
                # so the latch is left on.
                pass
    return None

def imgsize(src):
    """Return the image size in pixels for a given image source.

    src is a URL, or a plain path (anything without a ":"), which is
    fetched as a file: URL.  The result is a (width, height) pair --
    decimal strings when an external tool supplied the size, whatever
    Image.size yields when the Python Imaging Library did -- or None when
    the source cannot be retrieved or none of the tools can analyze it.
    Analysis failures are reported on stderr.
    """
    try:
        from urllib.request import urlcleanup
    except ImportError:
        from urllib import urlcleanup
    if ":" not in src:
        src = "file:" + src
    try:
        (filename, _headers) = urlretrieve(src)
    except IOError:
        return None
    # Now let's see if we can get a size for the retrieved image.
    try:
        dimensions = _local_imgsize(filename)
    finally:
        # Remove any temporary copy urlretrieve() made of a remote image.
        urlcleanup()
    if dimensions is None:
        # All attempts failed
        sys.stderr.write("imgsizer: couldn't analyze %s\n" % src)
    return dimensions

# SRC attribute; group 1 is the (optionally double-quoted) value.
source  = re.compile(r'SRC\s*=\s*"?([^" \t\n]*)"?', re.I)
# Existing numeric WIDTH/HEIGHT attributes, including leading spaces.
awidth  = re.compile(r' *WIDTH\s*=\s*"?[0-9]*"?', re.I)
aheight = re.compile(r' *HEIGHT\s*=\s*"?[0-9]*"?', re.I)
# WIDTH/HEIGHT attributes given as percentages.
pwidth  = re.compile(r'WIDTH\s*=\s*"?[0-9]*%"?', re.I)
pheight = re.compile(r'HEIGHT\s*=\s*"?[0-9]*%"?', re.I)

def transform(attr):
    """Return the IMG attribute text `attr` with correct size attributes.

    attr is returned unchanged when it has no usable SRC, when an existing
    width or height is a percentage, when the module-level no_overwrite
    flag is set and both width and height are already present, or when
    imgsize() cannot determine the size.  Otherwise any existing numeric
    width/height attributes are removed and fresh ones are appended.
    """
    src = source.search(attr)
    # Must have a source part and no percents in existing width or height
    if not src or pwidth.search(attr) or pheight.search(attr):
        return attr
    url = src.group(1)
    # An empty SRC (src="") names no image; there is nothing to measure.
    if not url:
        return attr
    if no_overwrite and awidth.search(attr) and aheight.search(attr):
        return attr
    # Correct the url for documentation root, if present
    if url.startswith('/') and root:
        url = os.path.join(root, url[1:])
    # OK, get the size tuple if possible
    dimensions = imgsize(url)
    if not dimensions:
        return attr
    # Nuke any old size attr.  This must also happen under no_overwrite:
    # getting here with that flag set means at most one of width/height
    # was present, and keeping it would emit that attribute twice.
    attr = awidth.sub("", attr)
    attr = aheight.sub("", attr)
    (xc, yc) = dimensions
    # Plug in the new attr
    return attr + attrformat(xc, "width") + attrformat(yc, "height")

# Output lowercase tags by default; sizefix() resets this for each IMG
# tag it finds, according to the case of the tag name.
lower = 1

# Set the default DocumentRoot to the current working directory.
root = "."

# Nothing in this file reads the next two names; they are kept in case
# anything outside it does.
out = "imgsizer-out$$"
mydir = "."     # NOTE: if you are doing <yourfile make sure that pwd is correct!

# Collect options from the environment first, then the command line.
# str.split() is used because string.split() does not exist in Python 3.
options = os.environ.get("IMGSIZER", "").split() + sys.argv[1:]

# Process options
try:
    (options, arguments) = getopt.getopt(
        options, "Vhd:n",
        ('version', 'help', 'usage', 'document=', 'no-overwrite'))
except getopt.GetoptError as err:
    # Report a bad switch as a one-line message instead of a traceback.
    sys.stderr.write("imgsizer: %s\n" % err)
    raise SystemExit(1)
no_overwrite = 0
for (switch, val) in options:
    if switch in ('-V', '--version'):
        print(splash % version)
        raise SystemExit
    elif switch in ('-h', '--help', '--usage'):
        # splash is a template; fill in the version before printing it.
        print((splash % version) + "\n\n" + usage)
        raise SystemExit
    elif switch in ('-d', '--document'):
        root = val
        if not os.path.isdir(root):
            print("Document root isn't a directory")
            raise SystemExit(1)
    elif switch in ('-n', '--no-overwrite'):
        no_overwrite = 1

if not arguments:
    # No file arguments: act as a filter from stdin to stdout.
    sizefix(sys.stdin, sys.stdout)
else:
    # Rewrite each named file in place, by way of a temporary file in the
    # same directory.
    for myfile in arguments:
        try:
            infp = open(myfile)
        except (IOError, OSError):
            print("imgsizer: can't open input file", myfile)
            raise SystemExit(1)
        tempfile = myfile + ".~imgsizer-%d~" % os.getpid()
        try:
            # Python 2 raises IOError here, Python 3 raises OSError.
            outfp = open(tempfile, "w")
        except (IOError, OSError):
            infp.close()
            print("imgsizer: can't open tempfile")
            raise SystemExit(1)
        try:
            sizefix(infp, outfp)
        finally:
            # Close (and thereby flush) both files before the result is
            # compared or renamed; otherwise buffered output may not have
            # reached the disk yet.
            infp.close()
            outfp.close()
        # Compare contents rather than just the os.stat() signatures.
        if filecmp.cmp(myfile, tempfile, shallow=False):
            # Nothing changed; leave the original file untouched.
            os.remove(tempfile)
        else:
            try:
                os.rename(tempfile, myfile)
            except OSError:
                sys.stderr.write("imgsizer: couldn't replace " + myfile + "\n")
                os.remove(tempfile)

# End
