text.lua 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515
  1. -- https://en.wikipedia.org/wiki/Unicode_block
  2. ---@alias CodePointRange {[1]: integer; [2]: integer}
  3. ---@type CodePointRange[]
  4. local zero_width_blocks = {
  5. {0x0000, 0x001F}, -- C0
  6. {0x007F, 0x009F}, -- Delete + C1
  7. {0x034F, 0x034F}, -- combining grapheme joiner
  8. {0x061C, 0x061C}, -- Arabic Letter Strong
  9. {0x200B, 0x200F}, -- {zero-width space, zero-width non-joiner, zero-width joiner, left-to-right mark, right-to-left mark}
  10. {0x2028, 0x202E}, -- {line separator, paragraph separator, Left-to-Right Embedding, Right-to-Left Embedding, Pop Directional Format, Left-to-Right Override, Right-to-Left Override}
  11. {0x2060, 0x2060}, -- word joiner
  12. {0x2066, 0x2069}, -- {Left-to-Right Isolate, Right-to-Left Isolate, First Strong Isolate, Pop Directional Isolate}
  13. {0xFEFF, 0xFEFF}, -- zero-width non-breaking space
  14. -- Some other characters can also be combined https://en.wikipedia.org/wiki/Combining_character
  15. {0x0300, 0x036F}, -- Combining Diacritical Marks 0 BMP Inherited
  16. {0x1AB0, 0x1AFF}, -- Combining Diacritical Marks Extended 0 BMP Inherited
  17. {0x1DC0, 0x1DFF}, -- Combining Diacritical Marks Supplement 0 BMP Inherited
  18. {0x20D0, 0x20FF}, -- Combining Diacritical Marks for Symbols 0 BMP Inherited
  19. {0xFE20, 0xFE2F}, -- Combining Half Marks 0 BMP Cyrillic (2 characters), Inherited (14 characters)
  20. -- Egyptian Hieroglyph Format Controls and Shorthand format Controls
  21. {0x13430, 0x1345F}, -- Egyptian Hieroglyph Format Controls 1 SMP Egyptian Hieroglyphs
  22. {0x1BCA0, 0x1BCAF}, -- Shorthand Format Controls 1 SMP Common
  23. -- not sure how to deal with those https://en.wikipedia.org/wiki/Spacing_Modifier_Letters
  24. {0x02B0, 0x02FF}, -- Spacing Modifier Letters 0 BMP Bopomofo (2 characters), Latin (14 characters), Common (64 characters)
  25. }
  26. -- All characters have the same width as the first one
  27. ---@type CodePointRange[]
  28. local same_width_blocks = {
  29. {0x3400, 0x4DBF}, -- CJK Unified Ideographs Extension A 0 BMP Han
  30. {0x4E00, 0x9FFF}, -- CJK Unified Ideographs 0 BMP Han
  31. {0x20000, 0x2A6DF}, -- CJK Unified Ideographs Extension B 2 SIP Han
  32. {0x2A700, 0x2B73F}, -- CJK Unified Ideographs Extension C 2 SIP Han
  33. {0x2B740, 0x2B81F}, -- CJK Unified Ideographs Extension D 2 SIP Han
  34. {0x2B820, 0x2CEAF}, -- CJK Unified Ideographs Extension E 2 SIP Han
  35. {0x2CEB0, 0x2EBEF}, -- CJK Unified Ideographs Extension F 2 SIP Han
  36. {0x2F800, 0x2FA1F}, -- CJK Compatibility Ideographs Supplement 2 SIP Han
  37. {0x30000, 0x3134F}, -- CJK Unified Ideographs Extension G 3 TIP Han
  38. {0x31350, 0x323AF}, -- CJK Unified Ideographs Extension H 3 TIP Han
  39. }
  40. local width_length_ratio = 0.5
  41. ---@type integer, integer
  42. local osd_width, osd_height = 100, 100
  43. ---Get byte count of utf-8 character at index i in str
  44. ---@param str string
  45. ---@param i integer?
  46. ---@return integer
  47. local function utf8_char_bytes(str, i)
  48. local char_byte = str:byte(i)
  49. local max_bytes = #str - i + 1
  50. if char_byte < 0xC0 then
  51. return math.min(max_bytes, 1)
  52. elseif char_byte < 0xE0 then
  53. return math.min(max_bytes, 2)
  54. elseif char_byte < 0xF0 then
  55. return math.min(max_bytes, 3)
  56. elseif char_byte < 0xF8 then
  57. return math.min(max_bytes, 4)
  58. else
  59. return math.min(max_bytes, 1)
  60. end
  61. end
  62. ---Creates an iterator for an utf-8 encoded string
  63. ---Iterates over utf-8 characters instead of bytes
  64. ---@param str string
  65. ---@return fun(): integer?, string?
  66. function utf8_iter(str)
  67. local byte_start = 1
  68. return function()
  69. local start = byte_start
  70. if #str < start then return nil end
  71. local byte_count = utf8_char_bytes(str, start)
  72. byte_start = start + byte_count
  73. return start, str:sub(start, start + byte_count - 1)
  74. end
  75. end
  76. ---Estimating string length based on the number of characters
  77. ---@param char string
  78. ---@return number
  79. function utf8_length(str)
  80. local str_length = 0
  81. for _, c in utf8_iter(str) do
  82. str_length = str_length + 1
  83. end
  84. return str_length
  85. end
  86. ---Extract Unicode code point from utf-8 character at index i in str
  87. ---@param str string
  88. ---@param i integer
  89. ---@return integer
  90. local function utf8_to_unicode(str, i)
  91. local byte_count = utf8_char_bytes(str, i)
  92. local char_byte = str:byte(i)
  93. local unicode = char_byte
  94. if byte_count ~= 1 then
  95. local shift = 2 ^ (8 - byte_count)
  96. char_byte = char_byte - math.floor(0xFF / shift) * shift
  97. unicode = char_byte * (2 ^ 6) ^ (byte_count - 1)
  98. end
  99. for j = 2, byte_count do
  100. char_byte = str:byte(i + j - 1) - 0x80
  101. unicode = unicode + char_byte * (2 ^ 6) ^ (byte_count - j)
  102. end
  103. return round(unicode)
  104. end
  105. ---Convert Unicode code point to utf-8 string
  106. ---@param unicode integer
  107. ---@return string?
  108. local function unicode_to_utf8(unicode)
  109. if unicode < 0x80 then
  110. return string.char(unicode)
  111. else
  112. local byte_count
  113. if unicode < 0x800 then
  114. byte_count = 2
  115. elseif unicode < 0x10000 then
  116. byte_count = 3
  117. elseif unicode < 0x110000 then
  118. byte_count = 4
  119. else
  120. return
  121. end -- too big
  122. local res = {}
  123. local shift = 2 ^ 6
  124. local after_shift = unicode
  125. for _ = byte_count, 2, -1 do
  126. local before_shift = after_shift
  127. after_shift = math.floor(before_shift / shift)
  128. table.insert(res, 1, before_shift - after_shift * shift + 0x80)
  129. end
  130. shift = 2 ^ (8 - byte_count)
  131. table.insert(res, 1, after_shift + math.floor(0xFF / shift) * shift)
  132. ---@diagnostic disable-next-line: deprecated
  133. return string.char(unpack(res))
  134. end
  135. end
  136. ---Update osd resolution if valid
  137. ---@param width integer
  138. ---@param height integer
  139. local function update_osd_resolution(width, height)
  140. if width > 0 and height > 0 then osd_width, osd_height = width, height end
  141. end
  142. mp.observe_property('osd-dimensions', 'native', function(_, dim)
  143. if dim then update_osd_resolution(dim.w, dim.h) end
  144. end)
  145. local measure_bounds
  146. do
  147. local text_osd = mp.create_osd_overlay('ass-events')
  148. text_osd.compute_bounds, text_osd.hidden = true, true
  149. ---@param ass_text string
  150. ---@return integer, integer, integer, integer
  151. measure_bounds = function(ass_text)
  152. update_osd_resolution(mp.get_osd_size())
  153. text_osd.res_x, text_osd.res_y = osd_width, osd_height
  154. text_osd.data = ass_text
  155. local res = text_osd:update()
  156. return res.x0, res.y0, res.x1, res.y1
  157. end
  158. end
  159. local normalized_text_width
  160. do
  161. ---@type {wrap: integer; bold: boolean; italic: boolean, rotate: number; size: number}
  162. local bounds_opts = {wrap = 2, bold = false, italic = false, rotate = 0, size = 0}
  163. ---Measure text width and normalize to a font size of 1
  164. ---text has to be ass safe
  165. ---@param text string
  166. ---@param size number
  167. ---@param bold boolean
  168. ---@param italic boolean
  169. ---@param horizontal boolean
  170. ---@return number, integer
  171. normalized_text_width = function(text, size, bold, italic, horizontal)
  172. bounds_opts.bold, bounds_opts.italic, bounds_opts.rotate = bold, italic, horizontal and 0 or -90
  173. local x1, y1 = nil, nil
  174. size = size / 0.8
  175. -- prevent endless loop
  176. local repetitions_left = 5
  177. repeat
  178. size = size * 0.8
  179. bounds_opts.size = size
  180. local ass = assdraw.ass_new()
  181. ass:txt(0, 0, horizontal and 7 or 1, text, bounds_opts)
  182. _, _, x1, y1 = measure_bounds(ass.text)
  183. repetitions_left = repetitions_left - 1
  184. -- make sure nothing got clipped
  185. until (x1 and x1 < osd_width and y1 < osd_height) or repetitions_left == 0
  186. local width = (repetitions_left == 0 and not x1) and 0 or (horizontal and x1 or y1)
  187. return width / size, horizontal and osd_width or osd_height
  188. end
  189. end
  190. ---Estimates character length based on utf8 byte count
  191. ---1 character length is roughly the size of a latin character
  192. ---@param char string
  193. ---@return number
  194. local function char_length(char)
  195. return #char > 2 and 2 or 1
  196. end
  197. ---Estimates string length based on utf8 byte count
  198. ---Note: Making a string in the iterator with the character is a waste here,
  199. ---but as this function is only used when measuring whole string widths it's fine
  200. ---@param text string
  201. ---@return number
  202. local function text_length(text)
  203. if not text or text == '' then return 0 end
  204. local text_length = 0
  205. for _, char in utf8_iter(tostring(text)) do text_length = text_length + char_length(char) end
  206. return text_length
  207. end
  208. ---Finds the best orientation of text on screen and returns the estimated max size
  209. ---and if the text should be drawn horizontally
  210. ---@param text string
  211. ---@return number, boolean
  212. local function fit_on_screen(text)
  213. local estimated_width = text_length(text) * width_length_ratio
  214. if osd_width >= osd_height then
  215. -- Fill the screen as much as we can, bigger is more accurate.
  216. return math.min(osd_width / estimated_width, osd_height), true
  217. else
  218. return math.min(osd_height / estimated_width, osd_width), false
  219. end
  220. end
  221. ---Gets next stage from cache
  222. ---@param cache {[any]: table}
  223. ---@param value any
  224. local function get_cache_stage(cache, value)
  225. local stage = cache[value]
  226. if not stage then
  227. stage = {}
  228. cache[value] = stage
  229. end
  230. return stage
  231. end
  232. ---Is measured resolution sufficient
  233. ---@param px integer
  234. ---@return boolean
  235. local function no_remeasure_required(px)
  236. return px >= 800 or (px * 1.1 >= osd_width and px * 1.1 >= osd_height)
  237. end
  238. local character_width
  239. do
  240. ---@type {[boolean]: {[string]: {[1]: number, [2]: integer}}}
  241. local char_width_cache = {}
  242. ---Get measured width of character
  243. ---@param char string
  244. ---@param bold boolean
  245. ---@return number, integer
  246. character_width = function(char, bold)
  247. ---@type {[string]: {[1]: number, [2]: integer}}
  248. local char_widths = get_cache_stage(char_width_cache, bold)
  249. local width_px = char_widths[char]
  250. if width_px and no_remeasure_required(width_px[2]) then return width_px[1], width_px[2] end
  251. local unicode = utf8_to_unicode(char, 1)
  252. for _, block in ipairs(zero_width_blocks) do
  253. if unicode >= block[1] and unicode <= block[2] then
  254. char_widths[char] = {0, math.huge}
  255. return 0, math.huge
  256. end
  257. end
  258. local measured_char = nil
  259. for _, block in ipairs(same_width_blocks) do
  260. if unicode >= block[1] and unicode <= block[2] then
  261. measured_char = unicode_to_utf8(block[1])
  262. width_px = char_widths[measured_char]
  263. if width_px and no_remeasure_required(width_px[2]) then
  264. char_widths[char] = width_px
  265. return width_px[1], width_px[2]
  266. end
  267. break
  268. end
  269. end
  270. if not measured_char then measured_char = char end
  271. -- half as many repetitions for wide characters
  272. local char_count = 10 / char_length(char)
  273. local max_size, horizontal = fit_on_screen(measured_char:rep(char_count))
  274. local size = math.min(max_size * 0.9, 50)
  275. char_count = math.min(math.floor(char_count * max_size / size * 0.8), 100)
  276. local enclosing_char, enclosing_width, next_char_count = '|', 0, char_count
  277. if measured_char == enclosing_char then
  278. enclosing_char = ''
  279. else
  280. enclosing_width = 2 * character_width(enclosing_char, bold)
  281. end
  282. local width_ratio, width, px = nil, nil, nil
  283. repeat
  284. char_count = next_char_count
  285. local str = enclosing_char .. measured_char:rep(char_count) .. enclosing_char
  286. width, px = normalized_text_width(str, size, bold, false, horizontal)
  287. width = width - enclosing_width
  288. width_ratio = width * size / (horizontal and osd_width or osd_height)
  289. next_char_count = math.min(math.floor(char_count / width_ratio * 0.9), 100)
  290. until width_ratio < 0.05 or width_ratio > 0.5 or char_count == next_char_count
  291. width = width / char_count
  292. width_px = {width, px}
  293. if char ~= measured_char then char_widths[measured_char] = width_px end
  294. char_widths[char] = width_px
  295. return width, px
  296. end
  297. end
  298. ---Calculate text width from individual measured characters
  299. ---@param text string|number
  300. ---@param bold boolean
  301. ---@return number, integer
  302. local function character_based_width(text, bold)
  303. local max_width = 0
  304. local min_px = math.huge
  305. for line in tostring(text):gmatch('([^\n]*)\n?') do
  306. local total_width = 0
  307. for _, char in utf8_iter(line) do
  308. local width, px = character_width(char, bold)
  309. total_width = total_width + width
  310. if px < min_px then min_px = px end
  311. end
  312. if total_width > max_width then max_width = total_width end
  313. end
  314. return max_width, min_px
  315. end
  316. ---Measure width of whole text
  317. ---@param text string|number
  318. ---@param bold boolean
  319. ---@param italic boolean
  320. ---@return number, integer
  321. local function whole_text_width(text, bold, italic)
  322. text = tostring(text)
  323. local size, horizontal = fit_on_screen(text)
  324. return normalized_text_width(ass_escape(text), size * 0.9, bold, italic, horizontal)
  325. end
  326. ---Scale normalized width to real width based on font size and italic
  327. ---@param opts {size: number; italic?: boolean}
  328. ---@return number, number
  329. local function opts_factor_offset(opts)
  330. return opts.size, opts.italic and opts.size * 0.2 or 0
  331. end
  332. ---Scale normalized width to real width based on font size and italic
  333. ---@param opts {size: number; italic?: boolean}
  334. ---@return number
  335. local function normalized_to_real(width, opts)
  336. local factor, offset = opts_factor_offset(opts)
  337. return factor * width + offset
  338. end
  339. do
  340. ---@type {[boolean]: {[boolean]: {[string|number]: {[1]: number, [2]: integer}}}} | {[boolean]: {[string|number]: {[1]: number, [2]: integer}}}
  341. local width_cache = {}
  342. ---Calculate width of text with the given opts
  343. ---@param text string|number
  344. ---@return number
  345. ---@param opts {size: number; bold?: boolean; italic?: boolean}
  346. function text_width(text, opts)
  347. if not text or text == '' then return 0 end
  348. ---@type boolean, boolean
  349. local bold, italic = opts.bold or options.font_bold, opts.italic or false
  350. if config.refine.text_width then
  351. ---@type {[string|number]: {[1]: number, [2]: integer}}
  352. local text_width = get_cache_stage(width_cache, bold)
  353. local width_px = text_width[text]
  354. if width_px and no_remeasure_required(width_px[2]) then return normalized_to_real(width_px[1], opts) end
  355. local width, px = character_based_width(text, bold)
  356. width_cache[bold][text] = {width, px}
  357. return normalized_to_real(width, opts)
  358. else
  359. ---@type {[string|number]: {[1]: number, [2]: integer}}
  360. local text_width = get_cache_stage(get_cache_stage(width_cache, bold), italic)
  361. local width_px = text_width[text]
  362. if width_px and no_remeasure_required(width_px[2]) then return width_px[1] * opts.size end
  363. local width, px = whole_text_width(text, bold, italic)
  364. width_cache[bold][italic][text] = {width, px}
  365. return width * opts.size
  366. end
  367. end
  368. end
  369. do
  370. ---@type {[string]: string}
  371. local cache = {}
  372. function timestamp_zero_rep_clear_cache()
  373. cache = {}
  374. end
  375. ---Replace all timestamp digits with 0
  376. ---@param timestamp string
  377. function timestamp_zero_rep(timestamp)
  378. local substitute = cache[#timestamp]
  379. if not substitute then
  380. substitute = timestamp:gsub('%d', '0')
  381. cache[#timestamp] = substitute
  382. end
  383. return substitute
  384. end
  385. ---Get width of formatted timestamp as if all the digits were replaced with 0
  386. ---@param timestamp string
  387. ---@param opts {size: number; bold?: boolean; italic?: boolean}
  388. ---@return number
  389. function timestamp_width(timestamp, opts)
  390. return text_width(timestamp_zero_rep(timestamp), opts)
  391. end
  392. end
  393. do
  394. local wrap_at_chars = {' ', ' ', '-', '–'}
  395. local remove_when_wrap = {' ', ' '}
  396. ---Wrap the text at the closest opportunity to target_line_length
  397. ---@param text string
  398. ---@param opts {size: number; bold?: boolean; italic?: boolean}
  399. ---@param target_line_length number
  400. ---@return string, integer
  401. function wrap_text(text, opts, target_line_length)
  402. local target_line_width = target_line_length * width_length_ratio * opts.size
  403. local bold, scale_factor, scale_offset = opts.bold or false, opts_factor_offset(opts)
  404. local wrap_at_chars, remove_when_wrap = wrap_at_chars, remove_when_wrap
  405. local lines = {}
  406. for _, text_line in ipairs(split(text, '\n')) do
  407. local line_width = scale_offset
  408. local line_start = 1
  409. local before_end = nil
  410. local before_width = scale_offset
  411. local before_line_start = 0
  412. local before_removed_width = 0
  413. for char_start, char in utf8_iter(text_line) do
  414. local char_end = char_start + #char - 1
  415. local char_width = character_width(char, bold) * scale_factor
  416. line_width = line_width + char_width
  417. if (char_end == #text_line) or itable_has(wrap_at_chars, char) then
  418. local remove = itable_has(remove_when_wrap, char)
  419. local line_width_after_remove = line_width - (remove and char_width or 0)
  420. if line_width_after_remove < target_line_width then
  421. before_end = remove and char_start - 1 or char_end
  422. before_width = line_width_after_remove
  423. before_line_start = char_end + 1
  424. before_removed_width = remove and char_width or 0
  425. else
  426. if (target_line_width - before_width) <
  427. (line_width_after_remove - target_line_width) then
  428. lines[#lines + 1] = text_line:sub(line_start, before_end)
  429. line_start = before_line_start
  430. line_width = line_width - before_width - before_removed_width + scale_offset
  431. else
  432. lines[#lines + 1] = text_line:sub(line_start, remove and char_start - 1 or char_end)
  433. line_start = char_end + 1
  434. line_width = scale_offset
  435. end
  436. before_end = line_start
  437. before_width = scale_offset
  438. end
  439. end
  440. end
  441. if #text_line >= line_start then
  442. lines[#lines + 1] = text_line:sub(line_start)
  443. elseif text_line == '' then
  444. lines[#lines + 1] = ''
  445. end
  446. end
  447. return table.concat(lines, '\n'), #lines
  448. end
  449. end
  450. do
  451. local word_separators = create_set({
  452. ' ', ' ', '\t', '-', '–', '_', ',', '.', '+', '&', '(', ')', '[', ']', '{', '}', '<', '>', '/', '\\',
  453. '(', ')', '【', '】', ';', ':', '《', '》', '“', '”', '‘', '’', '?', '!',
  454. })
  455. ---Get the first character of each word
  456. ---@param str string
  457. ---@return string[]
  458. function initials(str)
  459. local initials, is_word_start, word_separators = {}, true, word_separators
  460. for _, char in utf8_iter(str) do
  461. if word_separators[char] then
  462. is_word_start = true
  463. elseif is_word_start then
  464. initials[#initials + 1] = char
  465. is_word_start = false
  466. end
  467. end
  468. return initials
  469. end
  470. end