#!/bin/sh

# Copyright 2022,2023 Loïc Cerf (lcerf@dcc.ufmg.br)

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.

# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.

# Show the usage message when no argument is given or when help is
# requested.  A case statement replaces the obsolescent "[ ... -o ... ]"
# form, which is ambiguous when $1 looks like an operator.  The program
# name is given to printf as an argument, so that a "%" or a backslash
# in $0 cannot corrupt the format string.
case ${1-} in
    '' | -h | --help)
        printf 'Usage: %s max_char_per_line [file.srt]...

Approximately-evenly break too-long lines in .srt subtitles into lines
of at most max_char_per_line (> 1) characters (except for single
words), preferably on punctuation, without letting orphan words, and
using larger and larger lines.  Always have at most two lines on
screen, splitting the cues proportionally to the number of characters.
' "$0"
        exit
        ;;
esac

# The first argument is the maximal number of characters per line; the
# remaining arguments (if any) are the .srt files to process.
max=$1
shift

# After sed possibly substituting some spaces for '~', this script
# consists of a sequence of four AWK programs, all reading .srt
# subtitles on the standard input and writing .srt subtitles on the
# standard output.  One can comment some of those AWK programs to
# study the behavior of the remaining ones.

# The 1st and 3rd AWK programs greedily break the text lines in every
# subtitle (from its end to its beginning), keeping the cues as they
# are.  The 1st AWK program breaks on punctuation.  The 3rd AWK
# program breaks on spaces.  The help message states the objectives.

# The 2nd and 4th AWK programs keep the text lines as they are but
# group them by at most two and split the cues proportionally to the
# number of characters.  The 2nd AWK program outputs two-line
# subtitles only if each line has at most the specified maximal number
# of characters. The 4th AWK program always groups lines two by two.

# Protect some spaces from the line-breaking stages below by turning
# them into "~": the space after a two-letter word made of n or d
# followed by e, a or o, and the space after a one-letter vowel word.
# The last sed of the pipeline turns those "~" back into spaces.
# $max is quoted so that the shell neither splits nor globs its value.
sed '
s/\b\([nd][eao]\) /\1~/g
s/\b\([aeiouyAEIOUY]\) /\1~/g' "$@" | awk -v max="$max" '
# Return the end of the current line that follows the last word ending
# with a punctuation character.  The returned text always holds at least
# the last two words, and punctuation ending the first word is ignored.
# The global next_NF is set to the number of words preceding the
# returned text.  If the line has at most three words or no suitable
# punctuation, the whole line is returned and next_NF is set to 0.
function after_last_punct() {
    if (NF <= 3) {
        next_NF = 0
        return $0
    }
    next_NF = NF - 1
    after = $next_NF " " $NF
    # Walk leftwards, prepending words until one ends with punctuation.
    while (--next_NF != 1 && $next_NF !~ /[[:punct:]]$/)
        after = $next_NF " " after
    if (next_NF == 1) {
        # Only the first word is left: no usable punctuation was found.
        next_NF = 0
        return $0
    }
    return after
}

# Same search as after_last_punct, except that the returned text may
# consist of the last word only.  The global next_NF is set to the
# number of words preceding the returned text.  If the line has at most
# two words or no suitable punctuation, the whole line is returned and
# next_NF is set to 0.
function after_intermediary_punct() {
    if (NF <= 2) {
        next_NF = 0
        return $0
    }
    next_NF = NF
    after = $NF
    # Walk leftwards, prepending words until one ends with punctuation.
    while (--next_NF != 1 && $next_NF !~ /[[:punct:]]$/)
        after = $next_NF " " after
    if (next_NF == 1) {
        # Only the first word is left: no usable punctuation was found.
        next_NF = 0
        return $0
    }
    return after
}

# Cue lines (containing "-->") and lines with fewer than two words
# (subtitle numbers, blank lines, single words) are copied unchanged.
/^ *[0-9:,.]* *-->/ || NF < 2 {
    print
    next }

# Break a too-long text line on punctuation, from its end to its
# beginning.  out accumulates the lines already cut off; NF shrinks as
# the corresponding words are removed from the current record.
{
    for (out = ""; NF; ) {
        # What remains fits in one line: output it, then the lines cut.
        if (length <= max) {
            printf "%s", $0 "\n" out
            next }
        # Start a new output line with the text after the last
        # punctuation.  The helper is called once and its result reused.
        piece = after_last_punct()
        l = length(piece)
        out = piece "\n" out
        # length is still that of the whole remaining text here, because
        # NF is only reduced below.  soft_min first receives a rounded-up
        # quotient used as a number of lines, then the length each line
        # would have if the remaining text were evenly spread over that
        # many lines, with one separator between consecutive lines.
        soft_min = (length * (1 - 1 / max) + 1) / max
        if (soft_min != int(soft_min))
            soft_min = int(soft_min) + 1
        soft_min = (length + 1) / soft_min - 1
        NF = next_NF
        if (l <= soft_min) {
            # Extend the new line leftwards, one punctuation-delimited
            # chunk at a time, while it stays within soft_min.  l counts
            # the candidate chunk plus one joining space (++l).
            for (l += length(after_intermediary_punct()); NF > 2 && ++l <= soft_min; l += length(after_intermediary_punct())) {
                out = after_intermediary_punct() " " out
                NF = next_NF }
            # The chunk that exceeded soft_min is still taken if the
            # resulting line has at most max characters.
            if (NF > 2 && l <= max) {
                out = after_intermediary_punct() " " out
                NF = next_NF } } }
    printf "%s", out }' | awk -F \\n -v RS='' -v max="$max" '
# Convert a timestamp such as "HH:MM:SS,mmm" into a number of seconds.
# The comma of the last component is turned into a decimal point.
# Components missing on the left count as zero.  Explicit indices are
# used instead of two "--n" in one expression, whose result depends on
# an evaluation order that AWK leaves undefined.
function to_sec(t) {
    n = split(t, hms, /:/)
    sub(/,/, ".", hms[n])
    return hms[n] + 60 * hms[n - 1] + 3600 * hms[n - 2] }

# Print the global time (a number of seconds) as HH:MM:SS,mmm, without
# a trailing newline.  The time is first rounded to a whole number of
# milliseconds so that every component is an integer: the seconds are
# always zero-padded to two digits and rounding can never display 60
# seconds (the carry goes to the minutes and hours instead).
function print_time() {
    ms = int(time * 1000 + 0.5)
    h = int(ms / 3600000)
    ms -= 3600000 * h
    m = int(ms / 60000)
    ms -= 60000 * m
    s = int(ms / 1000)
    ms -= 1000 * s
    printf "%02d:%02d:%02d,%03d", h, m, s, ms }

# Print the header of the next subtitle: its number (global counter nb)
# on one line, then "start --> end" without a trailing newline.  The
# start is the global time; the end is that time advanced by duration,
# and time is left at the end value, ready for the following subtitle.
function print_cue(duration) {
    nb += 1
    print nb
    print_time()
    time += duration
    printf " --> "
    print_time()
}

# Each record is one subtitle and each field one of its lines: $1 is the
# number, $2 the cue and the text starts at $3.  Text lines are output
# two by two when both have at most max characters, alone otherwise.
# The original cue is shared among the new subtitles proportionally to
# their number of characters.
{
    for (; $NF == ""; --NF);
    # A subtitle without any text line has nothing to display.  Skipping
    # it also avoids the division by zero that would otherwise abort
    # this stage.
    if (NF < 3)
        next
    split($2, interval, /-->/)
    time = to_sec(interval[1])
    # The divisor is the number of characters of the text lines, each
    # line counting one more for its separator; duration is therefore a
    # time per character.
    duration = (to_sec(interval[2]) - time) / (length - length($1) - length($2) - 1)
    for (i = 3; i < NF; ++i)
        if (length($i) > max) {
            # Too-long line: on screen alone.
            print_cue((length($i) + 1) * duration)
            print "\n" $i "\n" }
        else
            if (length($(i + 1)) > max) {
                # The next line is too long: both lines go alone.
                print_cue((length($i) + 1) * duration)
                print "\n" $i "\n"
                ++i
                print_cue((length($i) + 1) * duration)
                print "\n" $i "\n" }
            else {
                # Both lines fit: one two-line subtitle.  i is advanced
                # in its own statement rather than inside the print
                # expression, whose evaluation order is undefined.
                print_cue((length($i) + length($(i + 1)) + 2) * duration)
                print "\n" $i "\n" $(i + 1) "\n"
                ++i } }

# A last text line that was not grouped is output alone.
i == NF {
    print_cue((length($i) + 1) * duration)
    print "\n" $i "\n" }' | awk -v max="$max" '
# Cue lines (containing "-->") and lines with fewer than two words
# (subtitle numbers, blank lines, single words) are copied unchanged.
/^ *[0-9:,.]* *-->/ || NF < 2 {
    print
    next }

# Break a too-long text line on spaces, from its end to its beginning.
# out accumulates the lines already cut off; NF shrinks as the
# corresponding words are removed from the current record.
{
    for (out = ""; NF; ) {
        # What remains fits in one line: output it, then the lines cut.
        if (length <= max) {
            printf "%s", $0 "\n" out
            next }
        # l is the length of the last two words joined by one space
        # (the space is counted by ++l in the test below).
        l = length($(NF - 1)) + length($NF)
        # NOTE(review): the comparison is strict, so two words making
        # exactly max characters are not grouped; confirm it is intended.
        if (++l < max && NF > 1) {
            # Start the new line with the last two words, so that the
            # last word is not left alone on its line.
            out = $(NF - 1) " " $NF "\n" out
            # length is still that of the whole remaining text here.
            # soft_min first receives a rounded-up quotient used as a
            # number of lines, then the length each line would have if
            # the remaining text were evenly spread over that many
            # lines, with one separator between consecutive lines.
            soft_min = (length * (1 - 1 / max) + 1) / max
            if (soft_min != int(soft_min))
                soft_min = int(soft_min) + 1
            soft_min = (length + 1) / soft_min - 1
            NF -= 2
            if (l <= soft_min) {
                # Extend the new line leftwards, one word at a time,
                # while it stays within soft_min.  l counts the
                # candidate word plus one joining space (++l).
                for (l += length($NF); NF > 2 && ++l <= soft_min; l += length($--NF))
                    out = $NF " " out
                # The word that exceeded soft_min is still taken if the
                # resulting line has at most max characters.
                if (NF > 2 && l <= max) {
                    out = $NF " " out
                    --NF } } }
        else {
            # The last two words do not fit together (or only one word
            # remains): the last word makes a line on its own.
            out = $NF "\n" out
            --NF } }
    printf "%s", out }' | awk -F \\n -v RS='' '
# Convert a timestamp such as "HH:MM:SS,mmm" into a number of seconds.
# The comma of the last component is turned into a decimal point.
# Components missing on the left count as zero.  Explicit indices are
# used instead of two "--n" in one expression, whose result depends on
# an evaluation order that AWK leaves undefined.
function to_sec(t) {
    n = split(t, hms, /:/)
    sub(/,/, ".", hms[n])
    return hms[n] + 60 * hms[n - 1] + 3600 * hms[n - 2] }

# Print the global time (a number of seconds) as HH:MM:SS,mmm, without
# a trailing newline.  The time is first rounded to a whole number of
# milliseconds so that every component is an integer: the seconds are
# always zero-padded to two digits and rounding can never display 60
# seconds (the carry goes to the minutes and hours instead).
function print_time() {
    ms = int(time * 1000 + 0.5)
    h = int(ms / 3600000)
    ms -= 3600000 * h
    m = int(ms / 60000)
    ms -= 60000 * m
    s = int(ms / 1000)
    ms -= 1000 * s
    printf "%02d:%02d:%02d,%03d", h, m, s, ms }

# Print the header of the next subtitle: its number (global counter nb)
# on one line, then "start --> end" without a trailing newline.  The
# start is the global time; the end is that time advanced by duration,
# and time is left at the end value, ready for the following subtitle.
function print_cue(duration) {
    nb += 1
    print nb
    print_time()
    time += duration
    printf " --> "
    print_time()
}

# Each record is one subtitle and each field one of its lines: $1 is the
# number, $2 the cue and the text starts at $3.  Text lines are always
# output two by two, whatever their length.  The original cue is shared
# among the new subtitles proportionally to their number of characters.
{
    for (; $NF == ""; --NF);
    # A subtitle without any text line has nothing to display.  Skipping
    # it also avoids the division by zero that would otherwise abort
    # this stage.
    if (NF < 3)
        next
    split($2, interval, /-->/)
    time = to_sec(interval[1])
    # The divisor is the number of characters of the text lines, each
    # line counting one more for its separator; duration is therefore a
    # time per character.
    duration = (to_sec(interval[2]) - time) / (length - length($1) - length($2) - 1)
    for (i = 3; i < NF; ++i) {
        print_cue((length($i) + length($(i + 1)) + 2) * duration)
        # i is advanced in its own statement rather than inside the
        # print expression, whose evaluation order is undefined.
        print "\n" $i "\n" $(i + 1) "\n"
        ++i } }

# A last text line that was not grouped is output alone.
i == NF {
    print_cue((length($i) + 1) * duration)
    print "\n" $i "\n" }' | sed '
s/\b\([nd][eao]\)~/\1 /g
s/\b\([aeiouyAEIOUY]\)~/\1 /g'