/**
 * Input for converting plain text into an SSML document string.
 */
export type ConvertParam = {
  /** Text to be spoken; it is inserted as a text node inside the SSML body. */
  text: string;
  /** Written to the `name` attribute of the `<voice>` element. */
  voiceName: string;
  /** Written to the `rate` attribute of the `<prosody>` element. */
  speechRate: string;
};

/**
 * Input for building an SSML document whose body is a single `<phoneme>`
 * element.
 */
type PhonemeConvertParam = {
  /** Pronunciation, written to the `ph` attribute of the `<phoneme>` element. */
  phoneme: string;
  /** Written form, used as the text content of the `<phoneme>` element. */
  literal: string;
  /** Written to the `name` attribute of the `<voice>` element. */
  voiceName: string;
  /** Written to the `rate` attribute of the `<prosody>` element. */
  speechRate: string;
};

/**
 * Input for converting text while overriding how specific words are
 * pronounced.
 */
type WordPronounciationConvertParam = ConvertParam & {
  /**
   * Pronunciation overrides: each `literal` found in the text is replaced by a
   * `<phoneme>` element whose `ph` attribute is the matching `phoneme`.
   */
  wordPronounciations: { literal: string; phoneme: string }[];
};

/**
 * Converts plain text into an SSML document string.
 *
 * The text is trimmed and placed as a single text node inside the
 * `mstts:express-as` element of the document skeleton created by
 * `ssmlDocumentWithExpressAs`.
 *
 * @param param.text - Text to be spoken; surrounding whitespace is removed.
 * @param param.voiceName - Voice name forwarded to the document skeleton.
 * @param param.speechRate - Speech rate forwarded to the document skeleton.
 * @returns The serialized SSML document.
 */
export function text2Ssml({
  text,
  voiceName,
  speechRate,
}: ConvertParam): string {
  const ssmlDoc = ssmlDocumentWithExpressAs({ voiceName, speechRate });
  const spokenText = ssmlDoc.createTextNode(text.trim());
  // The colon of the prefixed tag name must be escaped inside a CSS selector.
  ssmlDoc.querySelector("mstts\\:express-as").appendChild(spokenText);
  return toSsmlString(ssmlDoc);
}

/**
 * Converts text into an SSML document string, replacing every occurrence of a
 * registered word with a `<phoneme>` element that carries its pronunciation.
 *
 * The caller's `wordPronounciations` array is left untouched, and entries
 * whose `literal` is empty are ignored.
 *
 * @param param.text - Text to be spoken (used as given, without trimming).
 * @param param.voiceName - Voice name forwarded to the document skeleton.
 * @param param.speechRate - Speech rate forwarded to the document skeleton.
 * @param param.wordPronounciations - Literal/phoneme pairs to apply.
 * @returns The serialized SSML document.
 * @throws Error if the document skeleton has no `mstts:express-as` element.
 */
export function textWithWordPronounciations2Ssml({
  text,
  voiceName,
  speechRate,
  wordPronounciations,
}: WordPronounciationConvertParam): string {
  // `filter` yields a fresh array, so the in-place `sort` below cannot reorder
  // the caller's array. Empty literals are dropped because `split("")` would
  // break the text into single UTF-16 code units.
  // Descending order presumably lets a literal be applied before any shorter
  // literal that is its prefix.
  const sortedWordPronounciations = wordPronounciations
    .filter(({ literal }) => literal !== "")
    .sort((a, b) => b.literal.localeCompare(a.literal));

  const xml = ssmlDocumentWithExpressAs({ voiceName, speechRate });

  const textAndPhonemeNodes = sortedWordPronounciations.reduce<
    (string | HTMLElement)[]
  >(
    (nodes, { literal, phoneme }) =>
      nodes.flatMap((node) => {
        // Phoneme elements produced by an earlier replacement stay as they are.
        if (typeof node !== "string") {
          return [node];
        }
        // `split` removes every occurrence of the literal and cuts the text at
        // those positions.
        const pieces = node.split(literal);
        const parts: (string | HTMLElement)[] = [];
        pieces.forEach((piece, index) => {
          // Empty pieces (literal at the start/end or back-to-back) are skipped.
          if (piece !== "") {
            parts.push(piece);
          }
          // Between two pieces (where the literal used to be), insert the
          // replacement element; nothing follows the last piece.
          if (index < pieces.length - 1) {
            parts.push(createPhonemeElement(xml, phoneme, literal));
          }
        });
        return parts;
      }),
    [text]
  );

  // The colon of the prefixed tag name must be escaped inside a CSS selector.
  const expressAs = xml.querySelector("mstts\\:express-as");
  if (expressAs === null) {
    throw new Error("SSML document has no mstts:express-as element");
  }
  for (const node of textAndPhonemeNodes) {
    expressAs.appendChild(
      typeof node === "string" ? xml.createTextNode(node) : node
    );
  }

  return toSsmlString(xml);
}

/**
 * Builds an SSML document string whose body is a single `<phoneme>` element.
 *
 * @param param.phoneme - Pronunciation for the `<phoneme>` element.
 * @param param.literal - Written form used as the element's text.
 * @param param.voiceName - Voice name forwarded to the document skeleton.
 * @param param.speechRate - Speech rate forwarded to the document skeleton.
 * @returns The serialized SSML document.
 */
export function phoneme2Ssml({
  phoneme,
  literal,
  voiceName,
  speechRate,
}: PhonemeConvertParam): string {
  const ssmlDoc = ssmlDocumentWithExpressAs({ voiceName, speechRate });
  // The colon of the prefixed tag name must be escaped inside a CSS selector.
  const container = ssmlDoc.querySelector("mstts\\:express-as");
  container.appendChild(createPhonemeElement(ssmlDoc, phoneme, literal));
  return toSsmlString(ssmlDoc);
}

/**
 * Creates a detached `<phoneme>` element owned by the given document.
 *
 * @param xml - Document used to create the element.
 * @param phoneme - Value for the `ph` attribute.
 * @param literal - Text content of the element.
 * @returns The new element with `alphabet="sapi"`, `ph` and its text set.
 */
function createPhonemeElement(
  xml: XMLDocument,
  phoneme: string,
  literal: string
) {
  const element = xml.createElement("phoneme");
  const attributes: [string, string][] = [
    ["alphabet", "sapi"],
    ["ph", phoneme],
  ];
  for (const [name, value] of attributes) {
    element.setAttribute(name, value);
  }
  element.textContent = literal;
  return element;
}

/**
 * Creates the SSML document skeleton shared by all converters:
 * `<speak>` → `<voice>` → `<prosody>` → `<mstts:express-as style="cheerful">`.
 *
 * The root looks like
 * `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="ja-JP">`.
 *
 * @param param.voiceName - Value for the `name` attribute of `<voice>`.
 * @param param.speechRate - Value for the `rate` attribute of `<prosody>`.
 * @returns The document; callers append content to the `mstts:express-as` element.
 */
function ssmlDocumentWithExpressAs({
  voiceName,
  speechRate,
}: Omit<ConvertParam, "text">) {
  const ssmlNamespace = "http://www.w3.org/2001/10/synthesis";
  const ssmlDoc = document.implementation.createDocument(
    ssmlNamespace,
    "speak"
  );

  // Root <speak> attributes, set in this order.
  const speak = ssmlDoc.firstElementChild;
  const speakAttributes: [string, string][] = [
    ["version", "1.0"],
    ["xml:lang", "ja-JP"],
    ["xmlns:mstts", "https://www.w3.org/2001/mstts"],
  ];
  for (const [name, value] of speakAttributes) {
    speak.setAttribute(name, value);
  }

  // Build the nested elements from the innermost one outwards.
  const expressAs = ssmlDoc.createElement("mstts:express-as");
  expressAs.setAttribute("style", "cheerful");

  const prosody = ssmlDoc.createElement("prosody");
  prosody.setAttribute("rate", speechRate);
  prosody.appendChild(expressAs);

  const voice = ssmlDoc.createElement("voice");
  voice.setAttribute("name", voiceName);
  voice.appendChild(prosody);

  speak.appendChild(voice);
  return ssmlDoc;
}

/**
 * Serializes the SSML document to a string.
 *
 * Every ` xmlns=""` occurrence in the serialized text is removed, since that
 * attribute output is not wanted in the resulting SSML.
 *
 * @param xml - Document to serialize.
 * @returns The serialized markup without ` xmlns=""` declarations.
 */
function toSsmlString(xml: XMLDocument): string {
  const serialized = new XMLSerializer().serializeToString(xml.getRootNode());
  // Cutting at each occurrence and re-joining removes all of them.
  return serialized.split(` xmlns=""`).join("");
}
