Tente o seguinte comando:
# Build out.mp4: a slideshow of five still images with captions and
# 0.5-second cross-fades between consecutive images.
#
# Inputs: each JPEG is looped as a video stream (-loop 1) and limited with -t
#   (6 seconds for image-1, 5 seconds for the others).
#
# Filter graph, per input:
#   * drawtext runs first, so the caption is already burned in before the
#     stream is split; both the main segment and the copies used for the
#     transitions therefore carry the text. Input 0 gets two captions, the
#     second one 100 pixels lower.
#   * split duplicates the captioned stream: one copy is the main segment
#     ([v0]..[v4]); the remaining copies pass through fifo and feed the blends.
#     The first and last images need one transition copy, the middle ones two.
#   * the main copy of input 0 additionally fades in during its first second.
#
# Transitions: each blend takes the next image as its first input (A) and the
#   previous image as its second input (B). The expression
#   A*T/0.5+B*(0.5-T)/0.5 equals B at T=0 and A at T=0.5; trim=0:0.5 keeps
#   only that half second.
#
# Assembly: concat=n=9 joins the five main segments and the four transitions
#   in display order, format=yuv420p sets the output pixel format, and the
#   result [v] is mapped to out.mp4.
#
# NOTE: the font path /Library/Fonts/Arial.ttf is hard-coded and must exist on
#   the machine that runs this command.
ffmpeg \
-loop 1 -t 6 -i ok/image-1.jpg \
-loop 1 -t 5 -i ok/image-2.jpg \
-loop 1 -t 5 -i ok/image-3.jpg \
-loop 1 -t 5 -i ok/image-4.jpg \
-loop 1 -t 5 -i ok/image-5.jpg \
-filter_complex \
"[0:v]drawtext=fontfile=/Library/Fonts/Arial.ttf:text='FIRST TEXT':fontcolor=white:fontsize=24:x=(w-tw)/2:y=(h/PHI)+th,drawtext=fontfile=/Library/Fonts/Arial.ttf:text='AFTER FIRST TEXT':fontcolor=white:fontsize=24:x=(w-tw)/2:y=(h/PHI)+th+100,split[pre][pbv0];[pbv0]fifo[bv0]; \
[pre]fade=t=in:st=0:d=1[v0]; \
[1:v]drawtext=fontfile=/Library/Fonts/Arial.ttf:text='SECOND TEXT':fontcolor=white:fontsize=24:x=(w-tw)/2:y=(h/PHI)+th,split=3[pbv1a][pbv1b][v1];[pbv1a]fifo[bv1a];[pbv1b]fifo[bv1b]; \
[2:v]drawtext=fontfile=/Library/Fonts/Arial.ttf:text='THIRD TEXT':fontcolor=white:fontsize=24:x=(w-tw)/2:y=(h/PHI)+th,split=3[pbv2a][pbv2b][v2];[pbv2a]fifo[bv2a];[pbv2b]fifo[bv2b];\
[3:v]drawtext=fontfile=/Library/Fonts/Arial.ttf:text='FOURTH TEXT':fontcolor=white:fontsize=24:x=(w-tw)/2:y=(h/PHI)+th,split=3[pbv3a][pbv3b][v3];[pbv3a]fifo[bv3a];[pbv3b]fifo[bv3b]; \
[4:v]drawtext=fontfile=/Library/Fonts/Arial.ttf:text='FIFTH TEXT':fontcolor=white:fontsize=24:x=(w-tw)/2:y=(h/PHI)+th,split[pbv4][v4];[pbv4]fifo[bv4]; \
[bv1a][bv0]blend=all_expr='A*T/0.5+B*(0.5-T)/0.5',trim=0:0.5[01v]; \
[bv2a][bv1b]blend=all_expr='A*T/0.5+B*(0.5-T)/0.5',trim=0:0.5[12v]; \
[bv3a][bv2b]blend=all_expr='A*T/0.5+B*(0.5-T)/0.5',trim=0:0.5[23v]; \
[bv4][bv3b]blend=all_expr='A*T/0.5+B*(0.5-T)/0.5',trim=0:0.5[34v]; \
[v0][01v][v1][12v][v2][23v][v3][34v][v4]concat=n=9,format=yuv420p[v]" -map "[v]" out.mp4
Presumi que você quer que o texto permaneça visível durante o cross-fade, por isso refiz o seu comando: todos os textos devem ser aplicados primeiro, e só depois essas saídas (já com o texto) devem ser combinadas com o filtro blend.