#!/bin/sh

# Copyright 2022,2023 Loïc Cerf (lcerf@dcc.ufmg.br)

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.

# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.

# Print the help message on the standard output.  The script path is
# passed as a printf argument (never as part of the format string) so
# that a "%" or a backslash in the path cannot corrupt the message.
usage() {
  printf 'Usage: %s max_char_per_line [file.srt]...\n' "$0"
  printf '%s\n' \
    'Approximately-evenly break too-long lines in .srt subtitles into lines' \
    'of at most max_char_per_line (> 1) characters (except for single' \
    'words), preferably on punctuation, without letting orphan words, and' \
    'using larger and larger lines. Always have at most two lines on' \
    'screen, splitting the cues proportionally to the number of characters.'
}

# No argument, -h or --help: show the help and stop.
case ${1-} in
  '' | -h | --help)
    usage
    exit
    ;;
esac

# First argument: maximal number of characters per line.  The remaining
# arguments (if any) are the .srt files to read; without any, the
# standard input is read.
max=$1
shift

# The help message requires an integer greater than 1.  Anything else is
# rejected here: 0 would make the AWK programs divide by zero and a
# non-numeric value would silently turn their numeric comparisons into
# string comparisons.
case $max in
  *[!0-9]*)
    printf '%s: max_char_per_line must be an integer greater than 1 (got "%s")\n' \
      "$0" "$max" >&2
    exit 2
    ;;
esac
if [ "$max" -le 1 ]; then
  printf '%s: max_char_per_line must be an integer greater than 1 (got "%s")\n' \
    "$0" "$max" >&2
  exit 2
fi

# After sed possibly replacing some spaces with '~', this script
# consists of a sequence of four AWK programs, all reading .srt
# subtitles on the standard input and writing .srt subtitles on the
# standard output.  One can comment some of those AWK programs to
# study the behavior of the remaining ones.

# The 1st and 3rd AWK programs greedily break the text lines in every
# subtitle (from its end to its beginning), keeping the cues as they
# are.  The 1st AWK program breaks on punctuation.  The 3rd AWK
# program breaks on spaces.  The help message states the objectives.

# The 2nd and 4th AWK programs keep the text lines as they are but
# group them by at most two and split the cues proportionally to the
# number of characters.  The 2nd AWK program outputs two-line
# subtitles only if each line has at most the specified maximal number
# of characters.  The 4th AWK program always groups lines two by two.
# AWK helpers shared by the two cue-splitting stages (pair_fitting_lines and
# pair_all_lines).  They communicate through the AWK globals "time" (start of
# the next cue, in seconds), "per_char" (seconds allotted to one character)
# and "nb" (number of cues written so far).
cue_awk_lib='
# Convert an "hh:mm:ss,mmm" timestamp to seconds.  Missing leading
# components count as zero.
function to_sec(t,    n, hms) {
    n = split(t, hms, /:/)
    sub(/,/, ".", hms[n])
    return hms[n] + 60 * hms[n - 1] + 3600 * hms[n - 2]
}

# Format a number of seconds as "hh:mm:ss,mmm".  The value is rounded to
# whole milliseconds first, so that the seconds are always zero-padded to
# two digits and a rounding carry propagates to the minutes and hours.
function format_time(t,    ms, h, m, s) {
    ms = int(t * 1000 + 0.5)
    h = int(ms / 3600000)
    ms -= h * 3600000
    m = int(ms / 60000)
    ms -= m * 60000
    s = int(ms / 1000)
    ms -= s * 1000
    return sprintf("%02d:%02d:%02d,%03d", h, m, s, ms)
}

# Write one renumbered cue lasting "span" seconds from the current "time",
# followed by its text and a blank line, then advance "time".
function print_cue(span, text,    from) {
    from = format_time(time)
    time += span
    printf "%d\n%s --> %s\n%s\n\n", ++nb, from, format_time(time), text
}

# Prepare the current record (one subtitle: number, timing line, text
# lines).  Sets "time" and "per_char" and returns 1.  A subtitle without
# any text line is copied (renumbered) and 0 is returned: its character
# count is zero, which would otherwise be a division by zero.
function load_cue(    interval) {
    while (NF && $NF == "")
        --NF
    if (NF == 2) {
        printf "%d\n%s\n\n", ++nb, $2
        return 0
    }
    split($2, interval, /-->/)
    time = to_sec(interval[1])
    # Every text line counts for its length plus one.
    per_char = (to_sec(interval[2]) - time) / (length($0) - length($1) - length($2) - 1)
    return 1
}
'

# Replace with "~" the space following the two-letter words matching
# [nd][eao] and the one-letter words that are a vowel, so that the later
# stages, which break on spaces, keep such a word with the next one.
# Reads the files given as arguments, or the standard input.
# NOTE: \b is a GNU sed extension.
protect_short_words() {
  sed -e 's/\b\([nd][eao]\) /\1~/g' \
      -e 's/\b\([aeiouyAEIOUY]\) /\1~/g' "$@"
}

# Undo protect_short_words.  A "~" that already followed such a word in the
# input is turned into a space as well.
restore_short_words() {
  sed -e 's/\b\([nd][eao]\)~/\1 /g' \
      -e 's/\b\([aeiouyAEIOUY]\)~/\1 /g'
}

# Stage 1: break the text lines longer than $1 characters after words ending
# with punctuation, from the end of the line to its beginning.  Timing
# lines and lines with fewer than two words are copied unchanged.
break_on_punctuation() {
  awk -v max="$1" '
# Return the words after the last punctuation-terminated word, requiring at
# least two words after it and that it is not the first word.  next_NF is
# set to the index of that word.  If there is no such word, return the
# whole line and set next_NF to 0.
function after_last_punct() {
    if (NF > 3) {
        next_NF = NF - 1
        after = $next_NF " " $NF
        while (--next_NF != 1 && $next_NF !~ /[[:punct:]]$/)
            after = $next_NF " " after
        if (next_NF != 1)
            return after
    }
    next_NF = 0
    return $0
}

# Same as after_last_punct, but a single word after the punctuation is
# enough.
function after_intermediary_punct() {
    if (NF > 2) {
        next_NF = NF
        after = $NF
        while (--next_NF != 1 && $next_NF !~ /[[:punct:]]$/)
            after = $next_NF " " after
        if (next_NF != 1)
            return after
    }
    next_NF = 0
    return $0
}

/^ *[0-9:,.]* *-->/ || NF < 2 {
    print
    next
}

{
    # "out" accumulates the lines already cut off the end of $0.
    out = ""
    while (NF) {
        if (length($0) <= max) {
            printf "%s", $0 "\n" out
            next
        }
        l = length(after_last_punct())
        out = after_last_punct() "\n" out
        # Round the first quotient up to an integer, then derive from it
        # the soft minimal length of the line being built.
        soft_min = (length($0) * (1 - 1 / max) + 1) / max
        if (soft_min != int(soft_min))
            soft_min = int(soft_min) + 1
        soft_min = (length($0) + 1) / soft_min - 1
        NF = next_NF
        if (l <= soft_min) {
            # Prepend further punctuation-delimited chunks while the line
            # stays within the soft minimum ...
            l += length(after_intermediary_punct())
            while (NF > 2 && ++l <= soft_min) {
                out = after_intermediary_punct() " " out
                NF = next_NF
                l += length(after_intermediary_punct())
            }
            # ... and one more if it still fits in max characters.
            if (NF > 2 && l <= max) {
                out = after_intermediary_punct() " " out
                NF = next_NF
            }
        }
    }
    printf "%s", out
}'
}

# Stage 2: keep the text lines but regroup them into cues of at most two
# lines, two lines being shown together only if neither is longer than $1
# characters.  The original cue is split proportionally to the number of
# characters.
pair_fitting_lines() {
  awk -F '\n' -v RS='' -v max="$1" "$cue_awk_lib"'
{
    if (!load_cue())
        next
    i = 3
    while (i < NF) {
        if (length($i) > max) {
            print_cue((length($i) + 1) * per_char, $i)
            i += 1
        }
        else if (length($(i + 1)) > max) {
            print_cue((length($i) + 1) * per_char, $i)
            print_cue((length($(i + 1)) + 1) * per_char, $(i + 1))
            i += 2
        }
        else {
            print_cue((length($i) + length($(i + 1)) + 2) * per_char, $i "\n" $(i + 1))
            i += 2
        }
    }
    # A last line left alone.
    if (i == NF)
        print_cue((length($i) + 1) * per_char, $i)
}'
}

# Stage 3: break the text lines longer than $1 characters on spaces, from
# the end of the line to its beginning.  Timing lines and lines with fewer
# than two words are copied unchanged.
break_on_spaces() {
  awk -v max="$1" '
/^ *[0-9:,.]* *-->/ || NF < 2 {
    print
    next
}

{
    # "out" accumulates the lines already cut off the end of $0.
    out = ""
    while (NF) {
        if (length($0) <= max) {
            printf "%s", $0 "\n" out
            next
        }
        l = length($(NF - 1)) + length($NF)
        if (++l < max && NF > 1) {
            # Start the line with the last two words, so that no word is
            # left alone.
            out = $(NF - 1) " " $NF "\n" out
            # Round the first quotient up to an integer, then derive from
            # it the soft minimal length of the line being built.
            soft_min = (length($0) * (1 - 1 / max) + 1) / max
            if (soft_min != int(soft_min))
                soft_min = int(soft_min) + 1
            soft_min = (length($0) + 1) / soft_min - 1
            NF -= 2
            if (l <= soft_min) {
                # Prepend words while the line stays within the soft
                # minimum ...
                l += length($NF)
                while (NF > 2 && ++l <= soft_min) {
                    out = $NF " " out
                    --NF
                    l += length($NF)
                }
                # ... and one more if it still fits in max characters.
                if (NF > 2 && l <= max) {
                    out = $NF " " out
                    --NF
                }
            }
        }
        else {
            out = $NF "\n" out
            --NF
        }
    }
    printf "%s", out
}'
}

# Stage 4: keep the text lines but regroup them two by two, splitting the
# original cue proportionally to the number of characters.
pair_all_lines() {
  awk -F '\n' -v RS='' "$cue_awk_lib"'
{
    if (!load_cue())
        next
    for (i = 3; i < NF; i += 2)
        print_cue((length($i) + length($(i + 1)) + 2) * per_char, $i "\n" $(i + 1))
    # A last line left alone.
    if (i == NF)
        print_cue((length($i) + 1) * per_char, $i)
}'
}

# Remove a stage from this pipeline to study the behavior of the others.
protect_short_words "$@" |
  break_on_punctuation "$max" |
  pair_fitting_lines "$max" |
  break_on_spaces "$max" |
  pair_all_lines |
  restore_short_words