import axios from 'axios';

// Azure OpenAI configuration for the vision (image analysis) requests.
// Both values are read from the build-time environment; the endpoint falls
// back to the default deployment URL when the variable is unset or empty.
const API_KEY = process.env.REACT_APP_AZURE_OPENAI_API_KEY;
const AZURE_ENDPOINT = process.env.REACT_APP_AZURE_OPENAI_ENDPOINT || 'https://avajune.openai.azure.com/openai/deployments/AvaJune/chat/completions?api-version=2024-10-21';

// API key for the OpenAI realtime endpoint. It is read from the environment
// (same convention as the Azure key above) so that no secret lives in source
// control. A literal key used to be committed on this line: it remains in the
// repository history and must be treated as compromised (revoke and rotate).
// NOTE(review): REACT_APP_* values are presumably still inlined into the
// client bundle at build time - for production, have a server mint
// short-lived credentials instead of shipping a long-lived key. TODO confirm.
const OPENAI_REALTIME_API_KEY = process.env.REACT_APP_OPENAI_REALTIME_API_KEY;

// Module-level session state shared by the functions below.
let isStreamingImages = false;      // Controls continuous image capture
let latestVisionResult = "";        // Stores the most recent vision response (detailed summary)
let realtimeDataChannel = null;     // Reference to the active RTCDataChannel
let currentRealtimeSession = null;  // Holds current RTCPeerConnection and related info

// ---------------------------------------------------------------------
// startRealtimeAudioSession
// Establishes a WebRTC connection for realtime voice interaction.
//
// Steps: create the peer connection, route remote audio into an autoplaying
// <audio> element, attach the microphone, open the "oai-events" data
// channel, then exchange SDP with the realtime endpoint over HTTP.
//
// Returns: { pc, dc, audioEl, stream } - also stored as the module's current
//          session. `stream` is the microphone MediaStream.
// Throws:  whatever getUserMedia / WebRTC / fetch throw, or an Error when the
//          SDP exchange answers with a non-2xx status. On any failure the
//          microphone tracks are stopped and the peer connection is closed,
//          and the module's session state is left untouched.
// ---------------------------------------------------------------------
export const startRealtimeAudioSession = async () => {
  const model = "gpt-4o-realtime-preview-2024-12-17";
  const baseUrl = "https://api.openai.com/v1/realtime";

  const pc = new RTCPeerConnection();
  let stream = null;

  try {
    const audioEl = document.createElement("audio");
    audioEl.autoplay = true;
    pc.ontrack = (event) => {
      if (event.streams && event.streams[0]) {
        audioEl.srcObject = event.streams[0];
      }
    };

    stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    stream.getTracks().forEach((track) => pc.addTrack(track, stream));

    const dc = pc.createDataChannel("oai-events");
    dc.onmessage = (e) => {
      console.log("Realtime API event:", e.data);
    };

    const offer = await pc.createOffer();
    await pc.setLocalDescription(offer);

    // NOTE(review): this sends the API key straight from the browser;
    // consider a server-issued short-lived token instead - TODO confirm.
    const sdpResponse = await fetch(`${baseUrl}?model=${model}`, {
      method: "POST",
      body: offer.sdp,
      headers: {
        Authorization: `Bearer ${OPENAI_REALTIME_API_KEY}`,
        "Content-Type": "application/sdp"
      }
    });
    const answerSdp = await sdpResponse.text();
    // Without this check an HTTP error body would be handed to
    // setRemoteDescription and surface as a confusing SDP parse error.
    if (!sdpResponse.ok) {
      throw new Error(`Realtime SDP exchange failed (HTTP ${sdpResponse.status}): ${answerSdp}`);
    }
    await pc.setRemoteDescription({ type: "answer", sdp: answerSdp });

    // Publish the session only once the handshake has fully succeeded, so a
    // failed attempt never leaves a dead channel in the module state.
    realtimeDataChannel = dc;
    currentRealtimeSession = { pc, dc, audioEl, stream };
    return currentRealtimeSession;
  } catch (error) {
    // Release the microphone and the connection; nothing else holds them.
    if (stream) {
      stream.getTracks().forEach((track) => track.stop());
    }
    pc.close();
    throw error;
  }
};

// ---------------------------------------------------------------------
// stopRealtimeSession
// Closes the current realtime session and cleans up resources:
//   - stops every local track being sent on the peer connection (closing
//     the connection alone does not end microphone capture),
//   - closes the peer connection,
//   - detaches the remote stream from the session's audio element,
//   - clears the module-level session and data-channel references.
// Does nothing when no session is active, so it is safe to call repeatedly.
// ---------------------------------------------------------------------
export const stopRealtimeSession = () => {
  if (!currentRealtimeSession) {
    return;
  }

  const { pc, audioEl } = currentRealtimeSession;

  if (pc) {
    // Senders must be read before close(); stop their tracks so the
    // browser releases the microphone.
    if (typeof pc.getSenders === "function") {
      pc.getSenders().forEach((sender) => {
        if (sender.track) {
          sender.track.stop();
        }
      });
    }
    pc.close();
  }

  if (audioEl) {
    audioEl.srcObject = null;
  }

  realtimeDataChannel = null;
  currentRealtimeSession = null;
};

// ---------------------------------------------------------------------
// waitForDataChannelOpen
// Waits until the RTCDataChannel is open before resolving.
//
// dc: the data channel to wait on (read via readyState and its
//     "open" / "close" / "error" events).
// Resolves with no value once the channel is open.
// Rejects with an Error when the channel is already closing/closed, or
// closes or errors before opening - instead of waiting forever.
// Listeners are added with addEventListener and removed once settled, so
// any on* handlers already set on the channel are left untouched.
// ---------------------------------------------------------------------
const waitForDataChannelOpen = (dc) => {
  return new Promise((resolve, reject) => {
    if (dc.readyState === "open") {
      resolve();
      return;
    }
    if (dc.readyState === "closing" || dc.readyState === "closed") {
      reject(new Error(`Data channel is ${dc.readyState} and will not open.`));
      return;
    }

    const detach = () => {
      dc.removeEventListener("open", handleOpen);
      dc.removeEventListener("close", handleClose);
      dc.removeEventListener("error", handleError);
    };
    const handleOpen = () => {
      detach();
      resolve();
    };
    const handleClose = () => {
      detach();
      reject(new Error("Data channel closed before it opened."));
    };
    const handleError = (event) => {
      detach();
      reject((event && event.error) || new Error("Data channel failed before it opened."));
    };

    dc.addEventListener("open", handleOpen);
    dc.addEventListener("close", handleClose);
    dc.addEventListener("error", handleError);
  });
};

// ---------------------------------------------------------------------
// sendSystemInstructions
// Pushes system-role messages over the data channel to set Ava's context.
// The persona/instruction message is always sent; when visionContext is a
// non-empty string, a second message carrying "Visual context: <summary>"
// follows it.
//
// dc:            data channel (anything with a send(string) method).
// visionContext: latest summary from the vision subsystem (default "").
// ---------------------------------------------------------------------
const sendSystemInstructions = (dc, visionContext = "") => {
  // Wraps one text in a system "conversation.item.create" event and sends it.
  const pushSystemText = (text) => {
    const event = {
      type: "conversation.item.create",
      item: {
        type: "message",
        role: "system",
        content: [{ type: "input_text", text }]
      }
    };
    dc.send(JSON.stringify(event));
  };

  pushSystemText(
    "You are Ava, a realtime assistant. Your name is Ava. You are expert in fashion, skincare and makeup. You have continuous visual context from our vision subsystem. Use the provided summary to inform your responses. you can also use this summary to perform skin analysis or anything that user request if the information is available with your vision subset"
  );

  if (!visionContext) {
    return;
  }
  pushSystemText(`Visual context: ${visionContext}`);
};

// Id of the most recently started capture loop. A loop keeps running only
// while it is the newest one, so a stop followed by a quick restart cannot
// leave the previous loop running alongside the new one.
let imageStreamGeneration = 0;

// ---------------------------------------------------------------------
// startImageStreaming
// Continuously captures images from the video feed, sends them to the
// vision API, and updates the latest visual context. When the realtime
// data channel is open, each new summary is also pushed to it.
//
// videoElement: the video element to capture from; the call is a no-op when
//               it is missing or when streaming is already active.
// intervalMs:   pause between capture attempts (default 2000 ms).
//
// The loop ends once isStreamingImages is cleared or a newer loop has
// started. A vision result that arrives after that point is discarded
// rather than published. Errors inside one iteration are logged and the
// loop carries on with the next iteration.
// ---------------------------------------------------------------------
const startImageStreaming = async (videoElement, intervalMs = 2000) => {
  if (isStreamingImages || !videoElement) return;
  isStreamingImages = true;
  imageStreamGeneration += 1;
  const generation = imageStreamGeneration;
  const isActive = () => isStreamingImages && generation === imageStreamGeneration;

  while (isActive()) {
    try {
      if (videoElement instanceof HTMLVideoElement && videoElement.readyState >= HTMLMediaElement.HAVE_CURRENT_DATA) {
        const imageData = await captureImage(videoElement);
        const visionResponse = await callChatForImage(
          "Analyze the provided image in detail. If a person is present, provide an in-depth analysis including facial features (skin tone, facial structure, expression), hair details (style, color, texture, length), makeup, accessories (glasses, jewelry), and eye color. Also, describe any clothing or fashion items along with the overall scene context (dominant colors, textures, lighting, ambiance, and setting). If no person is present, describe the main objects and scene context. Return a concise summary with essential details.",
          imageData
        );
        // Streaming may have been stopped (or restarted) while the request
        // was in flight; do not publish a stale result.
        if (!isActive()) break;
        console.log("Vision AI Response:", visionResponse);
        latestVisionResult = visionResponse;
        if (realtimeDataChannel && realtimeDataChannel.readyState === "open") {
          sendSystemInstructions(realtimeDataChannel, latestVisionResult);
        }
      } else {
        console.log("Video element not ready for capture, waiting...");
      }
    } catch (error) {
      console.error("Error streaming image:", error);
    }
    await new Promise((r) => setTimeout(r, intervalMs));
  }
};

// ---------------------------------------------------------------------
// stopImageStreaming
// Requests the continuous image capture to stop by clearing the
// module-level isStreamingImages flag. Takes no arguments and returns
// nothing.
// ---------------------------------------------------------------------
export const stopImageStreaming = function () {
  isStreamingImages = false;
};

// ---------------------------------------------------------------------
// handleRealtimeAudioInteraction
// Listens for the next "response.done" event on the data channel.
// (On-demand vision requests are not active in this version.)
//
// dc:           data channel; its onmessage handler is replaced while the
//               interaction is pending and put back once it settles.
// videoElement: unused; kept so existing callers keep working.
// timeoutMs:    how long to wait for "response.done" (default 30000 ms).
//
// Resolves with { content }, taken from the first output item's `text` or
// `content`, or "" when the response carries no output item.
// Rejects with the parse/handler error when a message cannot be processed,
// or with the string "Realtime audio interaction timed out." on timeout
// (kept as a plain string for compatibility with existing callers).
// ---------------------------------------------------------------------
export const handleRealtimeAudioInteraction = async (dc, videoElement, timeoutMs = 30000) => {
  return new Promise((resolve, reject) => {
    const previousHandler = dc.onmessage;
    let timeoutId = null;

    // Settles the promise once and releases the timer and message handler.
    const settle = (finish, value) => {
      clearTimeout(timeoutId);
      // Only restore if our handler is still the installed one, so a handler
      // set by someone else in the meantime is not clobbered.
      if (dc.onmessage === handleMessage) {
        dc.onmessage = previousHandler;
      }
      finish(value);
    };

    const handleMessage = (event) => {
      try {
        const data = JSON.parse(event.data);
        if (data.type === "response.done") {
          const items = (data.response && data.response.output) || [];
          const output = items[0] || {};
          settle(resolve, { content: output.text || output.content || "" });
        }
      } catch (e) {
        settle(reject, e);
      }
    };

    dc.onmessage = handleMessage;
    timeoutId = setTimeout(() => {
      settle(reject, "Realtime audio interaction timed out.");
    }, timeoutMs);
  });
};

// ---------------------------------------------------------------------
// handleVoiceInteraction
// Initializes realtime audio, sends system instructions (with current vision
// context), and starts continuous image streaming.
//
// videoElement: passed through to the image streaming loop.
//
// Resolves once streaming has been kicked off; the streaming loop itself
// runs in the background. Rethrows any setup error after logging it. When
// the failure happens after this call established a session, that session
// is closed first so the connection is not left open.
// ---------------------------------------------------------------------
export const handleVoiceInteraction = async (videoElement) => {
  // True once this call has created a session; a failure of the start call
  // itself must not tear down a session that some earlier call created.
  let sessionStarted = false;
  try {
    const { dc } = await startRealtimeAudioSession();
    sessionStarted = true;
    await waitForDataChannelOpen(dc);
    sendSystemInstructions(dc, latestVisionResult);
    // Deliberately not awaited: the capture loop runs until streaming is
    // stopped. Attach a handler so a rejection is never left unhandled.
    startImageStreaming(videoElement).catch((error) => {
      console.error("Error streaming image:", error);
    });
    // Audio response is automatically handled via the remote audio track.
  } catch (error) {
    console.error("Error in realtime audio interaction:", error);
    if (sessionStarted) {
      stopRealtimeSession();
    }
    throw error;
  }
};

// ---------------------------------------------------------------------
// callChatForImage
// Posts one image question to the Azure OpenAI chat-completions endpoint.
// The request carries a fixed system prompt (vision-subsystem guidelines)
// followed by a user turn holding the question text and the image URL, and
// is capped at 300 tokens.
//
// question:  text prompt for the user turn.
// imageData: URL placed in the image_url part of the user turn.
//
// Returns the message content of the first choice in the response.
// Logs and rethrows any error raised while sending or reading the response.
// ---------------------------------------------------------------------
export const callChatForImage = async (question, imageData) => {
  const systemPrompt =
    "You are the vision subsystem, an integral part of our multimodal system integrated with realtime voice. Your role is to analyze the provided image in exhaustive detail and return a concise yet comprehensive summary for Ava (the realtime AI). For each image, follow these guidelines:\n\n1. If a person is present:\n   - Perform a thorough analysis of facial features, including skin tone, skin texture (noting any blemishes, freckles, or signs of aging), facial structure, and expression.\n   - Analyze the skin in detail, highlighting any unique features or conditions.\n   - Provide detailed information about the hair: style, color, texture, and length.\n   - Examine makeup and accessories, noting if the person is wearing makeup, glasses, jewelry, or other adornments.\n   - Identify eye details, such as color and shape, and include any noticeable traits.\n   - If possible, estimate the person's gender and approximate age.\n   - Describe any visible clothing or fashion items, including style, color, fabric, and design patterns.\n\n2. Regardless of a person's presence:\n   - Describe the overall scene context, including dominant colors, textures, lighting conditions, ambiance, and setting (e.g., indoor/outdoor, modern, vintage).\n   - If no person is detected, focus on the key objects and the environment.\n\nReturn a detailed summary that includes these essential aspects, ensuring Ava receives all relevant visual context for digital fashion, makeup, and skincare guidance.";

  const systemTurn = { role: "system", content: systemPrompt };
  const userTurn = {
    role: "user",
    content: [
      { type: "text", text: question },
      { type: "image_url", image_url: { url: imageData } }
    ]
  };
  const payload = {
    messages: [systemTurn, userTurn],
    max_tokens: 300
  };

  try {
    const requestConfig = {
      headers: {
        "Content-Type": "application/json",
        "api-key": API_KEY
      }
    };
    const { data } = await axios.post(AZURE_ENDPOINT, payload, requestConfig);
    return data.choices[0].message.content;
  } catch (error) {
    // Prefer the server's response body when there is one.
    let details;
    if (error.response) {
      details = error.response.data;
    } else {
      details = error.message;
    }
    console.error("Error calling Chat API for image:", details);
    throw error;
  }
};

// ---------------------------------------------------------------------
// captureImage
// Captures the current frame of the provided video element and returns it
// as a Base64 data URL (JPEG, quality 0.8).
//
// videoElement: an HTMLVideoElement with at least HAVE_CURRENT_DATA.
//
// Resolves with the data URL string.
// Rejects with:
//   - 'Invalid video element provided' (string) for a missing / wrong element,
//   - 'Video element not ready' (string) when no frame is available yet,
//   - an Error when no 2D context is available or the frame cannot be
//     encoded (toBlob produced no blob),
//   - whatever the FileReader's error event delivers on a read failure.
// ---------------------------------------------------------------------
export const captureImage = (videoElement) => {
  return new Promise((resolve, reject) => {
    if (!videoElement || !(videoElement instanceof HTMLVideoElement)) {
      reject('Invalid video element provided');
      return;
    }
    if (videoElement.readyState < HTMLMediaElement.HAVE_CURRENT_DATA) {
      reject('Video element not ready');
      return;
    }
    const canvas = document.createElement('canvas');
    canvas.width = videoElement.videoWidth;
    canvas.height = videoElement.videoHeight;
    const context = canvas.getContext('2d');
    if (!context) {
      reject(new Error('2D canvas context is not available'));
      return;
    }
    context.drawImage(videoElement, 0, 0);
    canvas.toBlob((blob) => {
      // toBlob hands back null when the image cannot be created (e.g. a
      // zero-sized canvas). Reading a null blob would throw inside this
      // callback and leave the promise pending forever.
      if (!blob) {
        reject(new Error('Failed to encode the video frame as JPEG'));
        return;
      }
      const reader = new FileReader();
      reader.onload = () => {
        resolve(reader.result);
      };
      reader.onerror = reject;
      reader.readAsDataURL(blob);
    }, 'image/jpeg', 0.8);
  });
};

// ---------------------------------------------------------------------
// setupCamera
// Points the given video element at a camera stream and starts playback.
//
// videoElement: element to attach the stream to; nothing happens without it.
// facingMode:   camera facing-mode constraint (default 'user').
//
// Any stream already attached to the element has its tracks stopped first.
// The function then requests a video stream, assigns it to srcObject, waits
// for the element's loadeddata callback, and calls play(). Failures in that
// sequence are logged to the console and not rethrown.
// ---------------------------------------------------------------------
export const setupCamera = async (videoElement, facingMode = 'user') => {
  if (!videoElement) {
    return;
  }

  // Release whatever stream the element was showing before.
  const previousStream = videoElement.srcObject;
  if (previousStream) {
    for (const track of previousStream.getTracks()) {
      track.stop();
    }
  }

  const constraints = { video: { facingMode } };
  try {
    videoElement.srcObject = await navigator.mediaDevices.getUserMedia(constraints);

    const firstFrameLoaded = new Promise((resolve) => {
      videoElement.onloadeddata = () => resolve();
    });
    await firstFrameLoaded;

    await videoElement.play();
  } catch (error) {
    console.error('Error accessing the camera:', error);
  }
};
