From 424f04015872323579460bdebb4d75e4b219a8fb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Sol=C3=ADs?=
Date: Mon, 2 Jun 2025 14:21:59 +0000
Subject: [PATCH] feat: Add the ability to split CSV and DB generation

---
 copy.sh                     |  3 ++
 youtube-download-channel.sh | 92 +++++++++++++++++++++++++------------
 2 files changed, 65 insertions(+), 30 deletions(-)

diff --git a/copy.sh b/copy.sh
index 0e221fd..d4dbb3a 100755
--- a/copy.sh
+++ b/copy.sh
@@ -1,4 +1,7 @@
 #!/bin/bash
+#Via https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script
+folder=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
+cd "${folder}" || exit
 for i in ./*.sh; do
 	shfmt -w "${i}"
 	shellcheck -o all -e SC2312 -f diff "${i}" | patch -p1
diff --git a/youtube-download-channel.sh b/youtube-download-channel.sh
index d027fd3..de4d715 100755
--- a/youtube-download-channel.sh
+++ b/youtube-download-channel.sh
@@ -6,10 +6,13 @@ channel=${1:-"subscriptions"}
 breaktime=${2:-"today-1month"}
 #3rd parameter: Seconds between data requests. Decrease to make downloads faster, but your account may be temporarily blocked if you use a number too low.
 sleeptime=${3:-"1.0"}
+#4th parameter: Whether to enable exporting to FreeTube playlist database (1=on by default, 0=off)
+enabledb=${4:-"1"}
+#5th parameter: Whether to enable exporting to a CSV file (1=on by default, 0=off)
+enablecsv=${5:-"1"}
 #Internal variables:
 #Via https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script
 folder=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
-echo "${folder}"
 #Required to download your own subscriptions.
 #Obtain this file through the procedure listed at
 # https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp
@@ -32,6 +35,9 @@ if [[ ! -d "${subfolder}" ]]; then
 	mkdir -v "${subfolder}"
 fi
 cd "${subfolder}" || exit
+if [[ ! -f "${archive}" ]]; then
+	touch "${archive}"
+fi
 if [[ -f "${channel}.tar.zst" ]]; then
 	tar -xvp -I zstd -f "${channel}.tar.zst"
 fi
@@ -41,7 +47,7 @@ url="https://www.youtube.com/@${channel}"
 if [[ "${channel}" = "subscriptions" ]]; then
 	url="https://www.youtube.com/feed/subscriptions"
 fi
-if [[ -z "${cookies}" && ${channel} = "subscriptions" ]]; then
+if [[ -z "${cookies}" && "${channel}" = "subscriptions" ]]; then
 	"${python}" "${ytdl}" "${url}" \
 		--skip-download --download-archive "${archive}" \
 		--dateafter "${breaktime}" \
@@ -58,42 +64,68 @@ else
 		--sleep-requests "${sleeptime}"
 fi
 rm -rf "${csv}"
-#ls -t | grep -e ".info.json" | while read -r x; do
+if [[ ! -f "${sortcsv}" ]]; then
+	touch "${sortcsv}"
+fi
 find . -type f -iname "*.info.json" -exec ls -t {} + | while read -r xp; do
 	x="${xp##./}"
-	#echo youtube $(jq -c '.id' "${x}" | sed -e "s/\"//g") | tee -a "${archive}" &
 	echo "youtube $(jq -cr '.id' "${x}")" | tee -a "${archive}" &
-	jq -c '[.upload_date, .timestamp, .uploader , .title, .webpage_url]' "${subfolder}/${x}" | while read -r i; do
-		echo "${i}" | sed -e "s/^\[//g" -e "s/\]$//g" -e "s/\\\\\"/"/g" | tee -a "${csv}" &
-	done
-	jq -c '[.upload_date, .timestamp]' "${subfolder}/${x}" | while read -r i; do
-		echo "${i},${x}" | sed -e "s/^\[//g" -e "s/\],/,/g" -e "s/\\\\\"/"/g" | tee -a "${sortcsv}" &
-	done
+	if [[ ${enablecsv} = "1" ]]; then
+		jq -c '[.upload_date, .timestamp, .uploader , .title, .webpage_url]' "${subfolder}/${x}" | while read -r i; do
+			echo "${i}" | sed -e "s/^\[//g" -e "s/\]$//g" -e "s/\\\\\"/"/g" | tee -a "${csv}" &
+		done
+	fi
+	if [[ ${enablecsv} = "1" || ${enabledb} = "1" ]]; then
+		jq -c '[.upload_date, .timestamp]' "${subfolder}/${x}" | while read -r i; do
+			echo "${i},${x}" | sed -e "s/^\[//g" -e "s/\],/,/g" -e "s/\\\\\"/"/g" | tee -a "${sortcsv}" &
+		done
+	fi
 	if [[ $(jobs -r -p | wc -l) -ge $(($(getconf _NPROCESSORS_ONLN) * 3 * 2)) ]]; then
 		wait -n
 	fi
 done
 wait
-sort "${sortcsv}" | uniq >"/tmp/${channel}-sort-ordered.csv"
-echo "{\"playlistName\":\"${channel}\",\"protected\":false,\"description\":\"Videos to watch later\",\"videos\":[" >"/tmp/${channel}.db"
-#cat "/tmp/${channel}-sort-ordered.csv" | while read -r line; do
-while read -r line; do
-	file=$(echo "${line}" | cut -d ',' -f3-)
-	echo "${file}"
-	jq -c "{\"videoId\": .id, \"title\": .title, \"author\": .uploader, \"authorId\": .channel_id, \
-\"lengthSeconds\": .duration, \"published\": ( .timestamp * 1000 ) , \"timeAdded\": $(date +%s)$(date +%N | cut -c-3), \
-\"playlistItemId\": \"$(cat /proc/sys/kernel/random/uuid)\", \"type\": \"video\"}" "${subfolder}/${file}" | tee -a "/tmp/${channel}.db"
-	echo "," >>"/tmp/${channel}.db"
-done <"/tmp/${channel}-sort-ordered.csv"
-echo "],\"_id\":\"${channel}\",\"createdAt\":$(date +%s),\"lastUpdatedAt\":$(date +%s)}" >>"/tmp/${channel}.db"
-rm "${json}"
-grep -v -e ":[ ]*null" "/tmp/${channel}.db" | tr '\n' '\r' | sed -e "s/,\r[,\r]*/,\r/g" | sed -e "s/,\r\]/\]/g" | tr '\r' '\n' | jq -c . >"${json}" && rm "/tmp/${channel}.db"
-rm "/tmp/${channel}-sort-ordered.csv" "${sortcsv}"
-sort "${csv}" | uniq >"/tmp/${channel}-without-header.csv"
-echo '"Upload Date", "Timestamp", "Uploader", "Title", "Webpage URL"' >"/tmp/${channel}.csv"
-cat "/tmp/${channel}-without-header.csv" >>"/tmp/${channel}.csv"
-mv "/tmp/${channel}.csv" "${csv}"
-rm "/tmp/${channel}-without-header.csv"
+if [[ ${enablecsv} = "1" || ${enabledb} = "1" ]]; then
+	sort "${sortcsv}" | uniq >"/tmp/${channel}-sort-ordered.csv"
+fi
+if [[ ${enabledb} = "1" ]]; then
+	rm "/tmp/${channel}.db"
+	echo "{\"playlistName\":\"${channel}\",\"protected\":false,\"description\":\"Videos to watch later\",\"videos\":[" >"/tmp/${channel}.db"
+fi
+if [[ ${enablecsv} = "1" || ${enabledb} = "1" ]]; then
+	while read -r line; do
+		file=$(echo "${line}" | cut -d ',' -f3-)
+		echo "${file}"
+		if [[ "${breaktime}" =~ ^[0-9]+$ ]]; then
+			uploaddate=$(echo "${line}" | cut -d ',' -f1 | sed -e "s/\"//g")
+			if [[ "${uploaddate}" -lt "${breaktime}" ]]; then
+				echo "Video ${file} uploaded on ${uploaddate}, removing..."
+				rm "${file}"
+			fi
+		fi
+		if [[ ${enabledb} = "1" ]]; then
+			if [[ -f "${file}" ]]; then
+				jq -c "{\"videoId\": .id, \"title\": .title, \"author\": .uploader, \"authorId\": .channel_id, \"lengthSeconds\": .duration, \"published\": ( .timestamp * 1000 ), \"timeAdded\": $(date +%s)$(date +%N | cut -c-3), \"playlistItemId\": \"$(cat /proc/sys/kernel/random/uuid)\", \"type\": \"video\"}" "${subfolder}/${file}" | tee -a "/tmp/${channel}.db"
+				echo "," >>"/tmp/${channel}.db"
+			fi
+		fi
+	done <"/tmp/${channel}-sort-ordered.csv"
+fi
+if [[ ${enabledb} = "1" ]]; then
+	echo "],\"_id\":\"${channel}\",\"createdAt\":$(date +%s),\"lastUpdatedAt\":$(date +%s)}" >>"/tmp/${channel}.db"
+	rm "${json}"
+	grep -v -e ":[ ]*null" "/tmp/${channel}.db" | tr '\n' '\r' | sed -e "s/,\r[,\r]*/,\r/g" | sed -e "s/,\r\]/\]/g" -e "s/\[\r,/\[/g" | tr '\r' '\n' | jq -c . >"${json}" && rm "/tmp/${channel}.db"
+fi
+if [[ ${enablecsv} = "1" || ${enabledb} = "1" ]]; then
+	rm "/tmp/${channel}-sort-ordered.csv" "${sortcsv}"
+fi
+if [[ ${enablecsv} = "1" ]]; then
+	sort "${csv}" | uniq >"/tmp/${channel}-without-header.csv"
+	echo '"Upload Date", "Timestamp", "Uploader", "Title", "Webpage URL"' >"/tmp/${channel}.csv"
+	cat "/tmp/${channel}-without-header.csv" >>"/tmp/${channel}.csv"
+	mv "/tmp/${channel}.csv" "${csv}"
+	rm "/tmp/${channel}-without-header.csv"
+fi
 sort "${archive}" | uniq >"/tmp/${channel}.txt"
 mv "/tmp/${channel}.txt" "${archive}"
 tar -cvp -I zstd -f "${channel}.tar.zst" ./*.info.json && rm ./*.info.json
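A minimal usage sketch of the new split, assuming the pre-existing positional parameters keep their order (1=channel, 2=breaktime, 3=sleeptime, defaults "subscriptions", "today-1month", "1.0") and using "somechannel" as a placeholder channel name:

	# Default: generate both the FreeTube playlist DB and the CSV
	./youtube-download-channel.sh somechannel today-1month 1.0
	# DB only: enabledb=1 (4th parameter), enablecsv=0 (5th parameter)
	./youtube-download-channel.sh somechannel today-1month 1.0 1 0
	# CSV only: enabledb=0, enablecsv=1
	./youtube-download-channel.sh somechannel today-1month 1.0 0 1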