@Article{info:doi/10.2196/71176, author="Balch, A. Jeremy and Desaraju, S. Sasank and Nolan, J. Victoria and Vellanki, Divya and Buchanan, R. Timothy and Brinkley, M. Lindsey and Penev, Yordan and Bilgili, Ahmet and Patel, Aashay and Chatham, E. Corinne and Vanderbilt, M. David and Uddin, Rayon and Bihorac, Azra and Efron, Philip and Loftus, J. Tyler and Rahman, Protiva and Shickel, Benjamin", title="Language Models for Multilabel Document Classification of Surgical Concepts in Exploratory Laparotomy Operative Notes: Algorithm Development Study", journal="JMIR Med Inform", year="2025", month="Jul", day="9", volume="13", pages="e71176", keywords="chart review", keywords="generative large language models", keywords="general surgery", keywords="natural language processing", keywords="exploratory laparotomy", abstract="Background: Operative notes are frequently mined for surgical concepts in clinical care, research, quality improvement, and billing, often requiring hours of manual extraction. These notes are typically analyzed at the document level to determine the presence or absence of specific procedures or findings (eg, whether a hand-sewn anastomosis was performed or contamination occurred). Extracting several binary classification labels simultaneously is a multilabel classification problem. Traditional natural language processing approaches---bag-of-words (BoW) and term frequency-inverse document frequency (tf-idf) with linear classifiers---have been used previously for this task but are now being augmented or replaced by large language models (LLMs). However, few studies have examined their utility in surgery. Objective: We developed and evaluated LLMs for the purpose of expediting data extraction from surgical notes. Methods: A total of 388 exploratory laparotomy notes from a single institution were annotated for 21 concepts related to intraoperative findings, intraoperative techniques, and closure techniques. Annotation consistency was measured using the Cohen $\kappa$ statistic. Data were preprocessed to include only the description of the procedure. We compared the evolution of document classification technologies from BoW and tf-idf to encoder-only (Clinical-Longformer) and decoder-only (Llama 3) transformer models. Multilabel classification performance was evaluated with 5-fold cross-validation using the F1-score and Hamming loss (HL). We evaluated models with and without added context. Errors were assessed by manual review. Code and implementation instructions may be found on GitHub. Results: The prevalence of labels ranged from 0.05 (colostomy, ileostomy, active bleed from named vessel) to 0.50 (running fascial closure). Llama 3.3 was the overall best-performing model (micro F1-score 0.88, 5-fold range: 0.88-0.89; HL 0.11, 5-fold range: 0.11-0.12). The BoW model (micro F1-score 0.68, 5-fold range: 0.64-0.71; HL 0.14, 5-fold range: 0.13-0.16) and Clinical-Longformer (micro F1-score 0.73, 5-fold range: 0.70-0.74; HL 0.11, 5-fold range: 0.10-0.12) had overall similar performance, with tf-idf models trailing (micro F1-score 0.57, 5-fold range: 0.55-0.59; HL 0.27, 5-fold range: 0.25-0.29). F1-scores varied across concepts in the Llama model, ranging from 0.30 (5-fold range: 0.23-0.39) for class III contamination to 0.92 (5-fold range: 0.84-0.98) for bowel resection. Context enhanced Llama's performance, adding an average improvement of 0.16 to the F1-scores. 
Error analysis demonstrated semantic nuances and edge cases within operative notes, particularly when patients had references to prior operations in their operative notes or simultaneous operations with other surgical services. Conclusions: Off-the-shelf autoregressive LLMs outperformed fine-tuned, encoder-only transformers and traditional natural language processing techniques in classifying operative notes. Multilabel classification with LLMs may streamline retrospective reviews in surgery, though further refinements are required prior to reliable use in research and quality improvement. ", doi="10.2196/71176", url="https://medinform.jmir.org/2025/1/e71176" } @Article{info:doi/10.2196/72815, author="Landerholm, August", title="AI in Qualitative Health Research Appraisal: Comparative Study", journal="JMIR Form Res", year="2025", month="Jul", day="8", volume="9", pages="e72815", keywords="artificial intelligence", keywords="qualitative research appraisal", keywords="systematic reviews", keywords="interrater agreement", keywords="CASP checklist", keywords="Critical Appraisal Skills Programme", keywords="JBI checklist", keywords="Joanna Briggs Institute", keywords="ETQS", keywords="Evaluative Tools for Qualitative Studies", keywords="large language models", keywords="affirmation bias", keywords="human-AI collaboration", abstract="Background: Qualitative research appraisal is crucial for ensuring credible findings but faces challenges due to human variability. Artificial intelligence (AI) models have the potential to enhance the efficiency and consistency of qualitative research assessments. Objective: This study aims to evaluate the performance of 5 AI models (GPT-3.5, Claude 3.5, Sonar Huge, GPT-4, and Claude 3 Opus) in assessing the quality of qualitative research using 3 standardized tools: Critical Appraisal Skills Programme (CASP), Joanna Briggs Institute (JBI) checklist, and Evaluative Tools for Qualitative Studies (ETQS). Methods: AI-generated assessments of 3 peer-reviewed qualitative papers in health and physical activity--related research were analyzed. The study examined systematic affirmation bias, interrater reliability, and tool-dependent disagreements across the AI models. Sensitivity analysis was conducted to evaluate the impact of excluding specific models on agreement levels. Results: A systematic affirmation bias was observed across all AI models, with ``Yes'' rates ranging from 75.9\% (145/191; Claude 3 Opus) to 85.4\% (164/192; Claude 3.5). GPT-4 diverged significantly, showing lower agreement (``Yes'': 115/192, 59.9\%) and higher uncertainty (``Cannot tell'': 69/192, 35.9\%). Proprietary models (GPT-3.5 and Claude 3.5) demonstrated near-perfect alignment (Cramer V=0.891; P<.001), while open-source models showed greater variability. Interrater reliability varied by assessment tool, with CASP achieving the highest baseline consensus (Krippendorff $\alpha$=0.653), followed by JBI ($\alpha$=0.477), and ETQS scoring lowest ($\alpha$=0.376). Sensitivity analysis revealed that excluding GPT-4 increased CASP agreement by 20\% ($\alpha$=0.784), while removing Sonar Huge improved JBI agreement by 18\% ($\alpha$=0.561). ETQS showed marginal improvements when excluding GPT-4 or Claude 3 Opus (+9\%, $\alpha$=0.409). Tool-dependent disagreements were evident, particularly in ETQS criteria, highlighting AI's current limitations in contextual interpretation. 
Conclusions: The findings demonstrate that AI models exhibit both promise and limitations as evaluators of qualitative research quality. While they enhance efficiency, AI models struggle with reaching consensus in areas requiring nuanced interpretation, particularly for contextual criteria. The study underscores the importance of hybrid frameworks that integrate AI scalability with human oversight, especially for contextual judgment. Future research should prioritize developing AI training protocols that emphasize qualitative epistemology, benchmarking AI performance against expert panels to validate accuracy thresholds, and establishing ethical guidelines for disclosing AI's role in systematic reviews. As qualitative methodologies evolve alongside AI capabilities, the path forward lies in collaborative human-AI workflows that leverage AI's efficiency while preserving human expertise for interpretive tasks. ", doi="10.2196/72815", url="https://formative.jmir.org/2025/1/e72815" } @Article{info:doi/10.2196/68776, author="Garcia-Carmona, Manuel Angel and Prieto, Maria-Lorena and Puertas, Enrique and Beunza, Juan-Jose", title="Leveraging Large Language Models for Accurate Retrieval of Patient Information From Medical Reports: Systematic Evaluation Study", journal="JMIR AI", year="2025", month="Jul", day="3", volume="4", pages="e68776", keywords="large language models", keywords="LangChain framework", keywords="electronic health records", keywords="data mining", keywords="model evaluation", keywords="health care", keywords="digitalization", abstract="Background: The digital transformation of health care has introduced both opportunities and challenges, particularly in managing and analyzing the vast amounts of unstructured medical data generated daily. There is a need to explore the feasibility of generative solutions in extracting data from medical reports, categorized by specific criteria. Objective: This study aimed to investigate the application of large language models (LLMs) for the automated extraction of structured information from unstructured medical reports, using the LangChain framework in Python. Methods: Through a systematic evaluation of leading LLMs---GPT-4o, Llama 3, Llama 3.1, Gemma 2, Qwen 2, and Qwen 2.5---using zero-shot prompting techniques and embedding results into a vector database, this study assessed the performance of LLMs in extracting patient demographics, diagnostic details, and pharmacological data. Results: Evaluation metrics, including accuracy, precision, recall, and F1-score, revealed high efficacy across most categories, with GPT-4o achieving the highest overall performance (91.4\% accuracy). Conclusions: The findings highlight notable differences in precision and recall between models, particularly in extracting names and age-related information. There were challenges in processing unstructured medical text, including variability in model performance across data types. Our findings demonstrate the feasibility of integrating LLMs into health care workflows; LLMs offer substantial improvements in data accessibility and support clinical decision-making processes. In addition, the paper describes the role of retrieval-augmented generation techniques in enhancing information retrieval accuracy, addressing issues such as hallucinations and outdated data in LLM outputs. 
Future work should explore the need for optimization through larger and more diverse training datasets, advanced prompting strategies, and the integration of domain-specific knowledge to improve model generalizability and precision. ", doi="10.2196/68776", url="https://ai.jmir.org/2025/1/e68776", url="http://www.ncbi.nlm.nih.gov/pubmed/40608403" } @Article{info:doi/10.2196/74423, author="Huang, Weihong and Wei, Wudi and He, Xiaotao and Zhan, Baili and Xie, Xiaoting and Zhang, Meng and Lai, Shiyi and Yuan, Zongxiang and Lai, Jingzhen and Chen, Rongfeng and Jiang, Junjun and Ye, Li and Liang, Hao", title="ChatGPT-Assisted Deep Learning Models for Influenza-Like Illness Prediction in Mainland China: Time Series Analysis", journal="J Med Internet Res", year="2025", month="Jun", day="27", volume="27", pages="e74423", keywords="time series analysis", keywords="epidemic forecasting", keywords="public health preparedness", keywords="model optimization", keywords="seasonal pattern", abstract="Background: Influenza in mainland China results in a large number of outpatient and emergency visits related to influenza-like illness (ILI) annually. While deep learning models show promise for improving influenza forecasting, their technical complexity remains a barrier to practical implementation. Large language models, such as ChatGPT, offer the potential to reduce these barriers by supporting automated code generation, debugging, and model optimization. Objective: This study aimed to evaluate the predictive performance of several deep learning models for ILI positive rates in mainland China and to explore the auxiliary role of ChatGPT-assisted development in facilitating model implementation. Methods: ILI positivity rate data spanning from 2014 to 2024 were obtained from the Chinese National Influenza Center (CNIC) database. In total, 5 deep learning architectures---long short-term memory (LSTM), neural basis expansion analysis for time series (N-BEATS), transformer, temporal fusion transformer (TFT), and time-series dense encoder (TiDE)---were developed using a ChatGPT-assisted workflow covering code generation, error debugging, and performance optimization. Models were trained on data from 2014 to 2023 and tested on holdout data from 2024 (weeks 1-39). Performance was evaluated using mean squared error (MSE), mean absolute error (MAE), and mean absolute percentage error (MAPE). Results: ILI trends exhibited clear seasonal patterns with winter peaks and summer troughs, alongside marked fluctuations during the COVID-19 pandemic period (2020-2022). All 5 deep learning models were successfully constructed, debugged, and optimized with the assistance of ChatGPT. Among the 5 models, TiDE achieved the best predictive performance nationally (MAE=5.551, MSE=43.976, MAPE=72.413\%) and in the southern region (MAE=7.554, MSE=89.708, MAPE=74.475\%). In the northern region, where forecasting proved more challenging, TiDE still performed best (MAE=4.131, MSE=28.922), although high percentage errors remained (MAPE>400\%). N-BEATS demonstrated the second-best performance nationally (MAE=9.423) and showed greater stability in the north (MAE=6.325). In contrast, transformer and TFT consistently underperformed, with national MAE values of 10.613 and 12.538, respectively. TFT exhibited the highest deviation (national MAPE=169.29\%). 
Extreme regional disparities were observed, particularly in northern China, where LSTM and TFT generated MAPE values exceeding 1918\%, despite LSTM's moderate performance in the south (MAE=9.460). Conclusions: Deep learning models, particularly TiDE, demonstrate strong potential for accurate ILI forecasting across diverse regions of China. Furthermore, large language models like ChatGPT can substantially enhance modeling efficiency and accessibility by assisting nontechnical users in model development. These findings support the integration of AI-assisted workflows into epidemic prediction systems as a scalable approach for improving public health preparedness. ", doi="10.2196/74423", url="https://www.jmir.org/2025/1/e74423" } @Article{info:doi/10.2196/69149, author="Karlin, Bradley and Henry, Doug and Anderson, Ryan and Cieri, Salvatore and Aratow, Michael and Shriberg, Elizabeth and Hoy, Michelle", title="Digital Phenotyping for Detecting Depression Severity in a Large Payor-Provider System: Retrospective Study of Speech and Language Model Performance", journal="JMIR AI", year="2025", month="Jun", day="19", volume="4", pages="e69149", keywords="depression", keywords="vocal biomarkers", keywords="artificial intelligence", keywords="behavioral health", keywords="machine learning", keywords="health care case management", keywords="mobile phone", abstract="Background: There is considerable need to improve and increase the detection and measurement of depression. The use of speech as a digital biomarker of depression represents a considerable opportunity for transforming and accelerating depression identification and treatment; however, research to date has primarily consisted of small-sample feasibility or pilot studies incorporating highly controlled applications and settings. There has been limited examination of the technology in real-world use contexts. Objective: This study evaluated the performance of a machine learning (ML) model examining both semantic and acoustic properties of speech in predicting depression across more than 2000 real-world interactions between health plan members and case managers. Methods: A total of 2086 recordings of case management calls with verbally administered Patient Health Questionnaire---9 questions (PHQ-9) surveys were analyzed using the ML model after the portions of the recordings with the PHQ-9 survey were manually redacted. The recordings were divided into a Development Set (Dev Set) (n=1336) and a Blind Set (n=671), and Patient Health Questionnaire---8 questions (PHQ-8) scores were provided for the Dev Set for ML model refinement while PHQ-8 scores from the Blind Set were withheld until after ML model depression severity output was reported. Results: The Dev Set and the Blind Set were well matched for age (Dev Set: mean 53.7, SD 16.3 years; Blind Set: mean 51.7, SD 16.9 years), gender (Dev Set: 910/1336, 68.1\% of female participants; Blind Set: 462/671, 68.9\% of female participants), and depression severity (Dev Set: mean 10.5, SD 6.1 of PHQ-8 scores; Blind Set: mean 10.9, SD 6.0 of PHQ-8 scores). The concordance correlation coefficient was $\rho$c=0.57 for the test of the ML model on the Dev Set and $\rho$c=0.54 on the Blind Set, while the mean absolute error was 3.91 for the Dev Set and 4.06 for the Blind Set, demonstrating strong model performance. 
This performance was maintained when dividing each set into subgroups of age brackets (≤39, 40-64, and ≥65 years), biological sex, and the 4 categories of Social Vulnerability Index (an index based on 16 social factors), with concordance correlation coefficients ranging from $\rho$c=0.44 to 0.61. Performance at PHQ-8 threshold score cutoffs of 5, 10, 15, and 20, representing the depression severity categories of none, mild, moderate, moderately severe, and severe (≥20), respectively, expressed as area under the receiver operating characteristic curve values, varied between 0.79 and 0.83 in both the Dev and Blind Sets. Conclusions: Overall, the findings suggest that speech may have significant potential for detection and measurement of depression severity across a variety of age, gender, and socioeconomic categories, which may enhance treatment, improve clinical decision-making, and enable truly personalized treatment recommendations. ", doi="10.2196/69149", url="https://ai.jmir.org/2025/1/e69149" } @Article{info:doi/10.2196/68212, author="Parker, T. Susan", title="Supervised Natural Language Processing Classification of Violent Death Narratives: Development and Assessment of a Compact Large Language Model", journal="JMIR AI", year="2025", month="Jun", day="19", volume="4", pages="e68212", keywords="natural language processing", keywords="NLP", keywords="violence", keywords="informatics", keywords="text classification", keywords="simulation", keywords="violent death", keywords="narrative", keywords="large language model", keywords="LLM", keywords="injury prevention", keywords="violent injury", keywords="coroner reports", keywords="police report", abstract="Background: The recent availability of law enforcement and coroner or medical examiner reports for nearly every violent death in the United States expands the potential for natural language processing (NLP) research into violence. Objective: The objective of this work is to assess applications of supervised NLP to unstructured data in the National Violent Death Reporting System to predict circumstances and types of violent death. Methods: This analysis applied distilBERT, a compact large language model (LLM) with fewer parameters relative to full-scale LLMs, to unstructured narrative data to simulate the impacts of preprocessing, volume, and composition of training data on model performance, evaluated by F1-scores, precision, recall, and the false negative rate. Model performance was evaluated for bias by race, ethnicity, and sex by comparing F1-scores across subgroups. Results: A minimum training set of 1500 cases was necessary to achieve an F1-score of 0.6 and a false negative rate of 0.01-0.05 with a compact LLM. Replacement of domain-specific jargon improved model performance, while oversampling positive class cases to address class imbalance did not substantially improve F1-scores. Between racial and ethnic groups, F1-score disparities ranged from 0.2 to 0.25, and between male and female decedents, differences ranged from 0.12 to 0.2. Conclusions: Compact LLMs with sufficient training data can be applied to supervised NLP tasks with a class imbalance in the National Violent Death Reporting System. Simulations of supervised text classification across the model-fitting process of preprocessing and training informed compact LLM NLP applications to unstructured death narrative data. 
", doi="10.2196/68212", url="https://ai.jmir.org/2025/1/e68212" } @Article{info:doi/10.2196/70315, author="Leiser, Florian and Guse, Richard and Sunyaev, Ali", title="Large Language Model Architectures in Health Care: Scoping Review of Research Perspectives", journal="J Med Internet Res", year="2025", month="Jun", day="19", volume="27", pages="e70315", keywords="large language models", keywords="scoping review", keywords="ChatGPT", keywords="generative artificial intelligence", keywords="digital health", keywords="medical informatics", abstract="Background: Large language models (LLMs) can support health care professionals in their daily work, for example, when writing and filing reports or communicating diagnoses. With the rise of LLMs, current research investigates how LLMs could be applied in medical practice and their benefits for physicians in clinical workflows. However, most studies neglect the importance of selecting suitable LLM architectures. Objective: In this literature review, we aim to provide insights on the different LLM model architecture families (ie, Bidirectional Encoder Representations from Transformers [BERT]--based or generative pretrained transformer [GPT]--based models) used in previous research. We report on the suitability and benefits of different LLM model architecture families for various research foci. Methods: To this end, we conduct a scoping review to identify which LLMs are used in health care. Our search included manuscripts from PubMed, arXiv, and medRxiv. We used open and selective coding to assess the 114 identified manuscripts regarding 11 dimensions related to usage and technical facets and the research focus of the manuscripts. Results: We identified 4 research foci that emerged previously in manuscripts, with LLM performance being the main focus. We found that GPT-based models are used for communicative purposes such as examination preparation or patient interaction. In contrast, BERT-based models are used for medical tasks such as knowledge discovery and model improvements. Conclusions: Our study suggests that GPT-based models are better suited for communicative purposes such as report generation or patient interaction. BERT-based models seem to be better suited for innovative applications such as classification or knowledge discovery. This could be due to the architectural differences where GPT processes language unidirectionally and BERT bidirectionally, allowing more in-depth understanding of the text. In addition, BERT-based models seem to allow more straightforward extensions of their models for domain-specific tasks that generally lead to better results. In summary, health care professionals should consider the benefits and differences of the LLM architecture families when selecting a suitable model for their intended purpose. 
", doi="10.2196/70315", url="https://www.jmir.org/2025/1/e70315", url="http://www.ncbi.nlm.nih.gov/pubmed/40536801" } @Article{info:doi/10.2196/66204, author="Knudsen, Ben and Madkour, Amr and Cholli, Preetam and Haslam, Alyson and Prasad, Vinay", title="Analysis of the Political Viewpoint of Policy Statements From Professional Medical Organizations Using ChatGPT With GPT-4: Cross-Sectional Study", journal="JMIR Form Res", year="2025", month="Jun", day="13", volume="9", pages="e66204", keywords="medical organizations", keywords="health policy", keywords="policy statements", keywords="public health", keywords="ChatGPT", keywords="AI", keywords="chatbot", keywords="political viewpoint", keywords="political alignment", keywords="political ideology", abstract="Background: Professional medical organizations publish policy statements that are used to impact legislation or address societal issues. Many organizations are nonpartisan, yet it is uncertain whether their policy statements balance liberal and conservative values. Objective: This study aims to evaluate the political viewpoint of policy statements from 6 influential medical organizations, including the American Academy of Pediatrics, American College of Surgeons, American Psychiatric Association, American College of Obstetricians and Gynecologists, American College of Physicians, and American Academy of Family Physicians. Methods: Between December 2023 and February 2024, policy statements from the 6 organizations were identified and evaluated using ChatGPT with GPT-4 to reduce bias. Each statement was pasted into a new ChatGPT session following the phrase ``Does this text align with a liberal or conservative viewpoint?'' Two authors reviewed each response and categorized the statement as liberal, probably liberal, neutral, probably conservative, or conservative. Results: One-third of policy statements (529/1592, 33.2\%) were found to be aligned with a political ideology. Among these 529 statements, 516 (97.5\%) were liberal or probably liberal and 13 (2.5\%) were conservative or probably conservative. For each organization, among policy statements with a political leaning, the percentage of liberal or probably liberal statements was as follows: 100\% (69/69) for the American Academy of Pediatrics, 100\% (24/24) for the American College of Obstetricians and Gynecologists, 100\% (12/12) for the American College of Surgeons, 99\% (72/73) for the American Psychiatric Association, 97\% (174/180) for the American Academy of Family Physicians, and 96\% (165/171) for the American College of Physicians. Conclusions: One in 3 policy statements from these 6 professional organizations align with a partisan political viewpoint. Among these, positions are 40 times more likely to be liberal or probably liberal than conservative or probably conservative. Whether or not organizations are politically neutral and seek viewpoint diversity warrants further exploration. 
", doi="10.2196/66204", url="https://formative.jmir.org/2025/1/e66204" } @Article{info:doi/10.2196/72638, author="Li, Ronghao and Mao, Shuai and Zhu, Congmin and Yang, Yingliang and Tan, Chunting and Li, Li and Mu, Xiangdong and Liu, Honglei and Yang, Yuqing", title="Enhancing Pulmonary Disease Prediction Using Large Language Models With Feature Summarization and Hybrid Retrieval-Augmented Generation: Multicenter Methodological Study Based on Radiology Report", journal="J Med Internet Res", year="2025", month="Jun", day="11", volume="27", pages="e72638", keywords="retrieval-augmented generation", keywords="large language models", keywords="prompt engineering", keywords="pulmonary disease prediction", keywords="RAG", keywords="LLM", abstract="Background: The rapid advancements in natural language processing, particularly the development of large language models (LLMs), have opened new avenues for managing complex clinical text data. However, the inherent complexity and specificity of medical texts present significant challenges for the practical application of prompt engineering in diagnostic tasks. Objective: This paper explores LLMs with new prompt engineering technology to enhance model interpretability and improve pulmonary disease prediction performance relative to a traditional deep learning model. Methods: A retrospective dataset including 2965 chest computed tomography (CT) radiology reports was constructed. The reports were from 4 cohorts, namely, healthy individuals and patients with pulmonary tuberculosis, lung cancer, and pneumonia. Then, a novel prompt engineering strategy that integrates feature summarization (F-Sum), chain of thought (CoT) reasoning, and a hybrid retrieval-augmented generation (RAG) framework was proposed. A feature summarization approach, leveraging term frequency--inverse document frequency (TF-IDF) and K-means clustering, was used to extract and distill key radiological findings related to 3 diseases. Simultaneously, the hybrid RAG framework combined dense and sparse vector representations to enhance LLMs' comprehension of disease-related text. In total, 3 state-of-the-art LLMs, GLM-4-Plus, GLM-4-air (Zhipu AI), and GPT-4o (OpenAI), were integrated with the prompt strategy to evaluate the efficiency in recognizing pneumonia, tuberculosis, and lung cancer. The traditional deep learning model, BERT (Bidirectional Encoder Representations from Transformers), was also compared to assess the superiority of LLMs. Finally, the proposed method was tested on an external validation dataset consisting of 343 chest CT reports from another hospital. Results: Compared with the BERT-based prediction model and various other prompt engineering techniques, our method with GLM-4-Plus achieved the best performance on the test dataset, attaining an F1-score of 0.89 and an accuracy of 0.89. On the external validation dataset, the F1-score (0.86) and accuracy (0.92) of the proposed method with GPT-4o were the highest. Compared to the popular strategy with manually selected typical samples (few-shot) and CoT designed by doctors (F1-score=0.83 and accuracy=0.83), the proposed method that summarized disease characteristics (F-Sum) based on an LLM and automatically generated CoT performed better (F1-score=0.89 and accuracy=0.90). Although the BERT-based model achieved similar results on the test dataset (F1-score=0.85 and accuracy=0.88), its predictive performance significantly decreased on the external validation set (F1-score=0.48 and accuracy=0.78). 
Conclusions: These findings highlight the potential of LLMs to revolutionize pulmonary disease prediction, particularly in resource-constrained settings, by surpassing traditional models in both accuracy and flexibility. The proposed prompt engineering strategy not only improves predictive performance but also enhances the adaptability of LLMs in complex medical contexts, offering a promising tool for advancing disease diagnosis and clinical decision-making. ", doi="10.2196/72638", url="https://www.jmir.org/2025/1/e72638" } @Article{info:doi/10.2196/67201, author="Zhang, Chi and Yang, Hao and Liu, Xingyun and Wu, Rongrong and Zong, Hui and Wu, Erman and Zhou, Yi and Li, Jiakun and Shen, Bairong", title="A Knowledge-Enhanced Platform (MetaSepsisKnowHub) for Retrieval Augmented Generation--Based Sepsis Heterogeneity and Personalized Management: Development Study", journal="J Med Internet Res", year="2025", month="Jun", day="6", volume="27", pages="e67201", keywords="human sepsis", keywords="knowledge-enhanced", keywords="personalized application", keywords="retrieval augmented generation", keywords="precision medicine", abstract="Background: Sepsis is a severe syndrome of organ dysfunction caused by infection; it has high heterogeneity and high in-hospital mortality, representing a grim clinical challenge for precision medicine in critical care. Objective: We aimed to extract reported sepsis biomarkers to provide users with comprehensive biomedical information and integrate retrieval augmented generation (RAG) and prompt engineering to enhance the accuracy, stability, and interpretability of clinical decisions recommended by large language models (LLMs). Methods: To address the challenge, we established and updated the first knowledge-enhanced platform, MetaSepsisKnowHub, comprising 427 sepsis biomarkers and 423 studies, aiming to systematically collect and annotate sepsis biomarkers to guide personalized clinical decision-making in the diagnosis and treatment of human sepsis. We curated a tailored LLM framework incorporating RAG and prompt engineering and included 2 performance evaluation scales: the System Usability Scale and the Net Promoter Score. Results: The overall quantitative ratings of expert-reviewed clinical recommendations based on RAG surpassed baseline responses generated by 4 LLMs and showed a statistically significant improvement in textual questions (GPT-4: mean 75.79, SD 7.11 vs mean 81.59, SD 9.87; P=.02; GPT-4o: mean 70.36, SD 7.63 vs mean 77.98, SD 13.26; P=.02; Qwen2.5-instruct: mean 77.08, SD 3.75 vs mean 85.46, SD 7.27; P<.001; and DeepSeek-R1: mean 77.67, SD 3.66 vs mean 86.42, SD 8.56; P<.001), but no statistically significant differences could be measured in clinical scenarios. The RAG assessment score comparing RAG-based responses and expert-provided benchmark answers demonstrated greater factual correctness, accuracy, and knowledge recall than the baseline responses. After use, the average System Usability Scale score was 82.20 (SD 14.17) and the Net Promoter Score was 72, demonstrating high user satisfaction and loyalty. Conclusions: We highlight the pioneering MetaSepsisKnowHub platform, and we show that combining MetaSepsisKnowHub with RAG can minimize limitations on precision and maximize the breadth of LLMs to shorten the bench-to-bedside distance, serving as a knowledge-enhanced paradigm for future application of artificial intelligence in critical care medicine. 
", doi="10.2196/67201", url="https://www.jmir.org/2025/1/e67201", url="http://www.ncbi.nlm.nih.gov/pubmed/40478618" } @Article{info:doi/10.2196/69955, author="Will, John and Gupta, Mahin and Zaretsky, Jonah and Dowlath, Aliesha and Testa, Paul and Feldman, Jonah", title="Enhancing the Readability of Online Patient Education Materials Using Large Language Models: Cross-Sectional Study", journal="J Med Internet Res", year="2025", month="Jun", day="4", volume="27", pages="e69955", keywords="patient education", keywords="health literacy", keywords="artificial intelligence", keywords="readability", keywords="health education", abstract="Background: Online accessible patient education materials (PEMs) are essential for patient empowerment. However, studies have shown that these materials often exceed the recommended sixth-grade reading level, making them difficult for many patients to understand. Large language models (LLMs) have the potential to simplify PEMs into more readable educational content. Objective: We sought to evaluate whether 3 LLMs (ChatGPT [OpenAI], Gemini [Google], and Claude [Anthropic PBC]) can optimize the readability of PEMs to the recommended reading level without compromising accuracy. Methods: This cross-sectional study used 60 randomly selected PEMs available online from 3 websites. We prompted LLMs to simplify the reading level of online PEMs. The primary outcome was the readability of the original online PEMs compared with the LLM-simplified versions. Readability scores were calculated using 4 validated indices: Flesch Reading Ease, Flesch-Kincaid Grade Level, Gunning Fog Index, and Simple Measure of Gobbledygook Index. Accuracy and understandability were also assessed as balancing measures, with understandability measured using the Patient Education Materials Assessment Tool-Understandability (PEMAT-U). Results: The original readability scores for the American Heart Association (AHA), American Cancer Society (ACS), and American Stroke Association (ASA) websites were above the recommended sixth-grade level, with mean grade level scores of 10.7, 10.0, and 9.6, respectively. After optimization by the LLMs, readability scores significantly improved across all 3 websites when compared with the original text. Compared with the original websites, the Wilcoxon signed rank test showed that ChatGPT improved the readability to 7.6 from 10.1 (P<.001); Gemini, to 6.6 (P<.001); and Claude, to 5.6 (P<.001). Word counts were significantly reduced by all LLMs, with a decrease from a mean range of 410.9-953.9 words to a mean range of 201.9-248.1 words. None of the ChatGPT LLM-simplified PEMs were inaccurate, while 3.3\% of Gemini and Claude LLM-simplified PEMs were inaccurate. Baseline understandability scores, as measured by PEMAT-U, were preserved across all LLM-simplified versions. Conclusions: This cross-sectional study demonstrates that LLMs have the potential to significantly enhance the readability of online PEMs while maintaining accuracy and understandability, making them more accessible to a broader audience. However, variability in model performance and demonstrated inaccuracies underscore the need for human review of LLM output. Further study is needed to explore advanced LLM techniques and models trained for medical content. 
", doi="10.2196/69955", url="https://www.jmir.org/2025/1/e69955", url="http://www.ncbi.nlm.nih.gov/pubmed/40465378" } @Article{info:doi/10.2196/68138, author="Amirahmadi, Ali and Etminani, Farzaneh and Bj{\"o}rk, Jonas and Melander, Olle and Ohlsson, Mattias", title="Trajectory-Ordered Objectives for Self-Supervised Representation Learning of Temporal Healthcare Data Using Transformers: Model Development and Evaluation Study", journal="JMIR Med Inform", year="2025", month="Jun", day="4", volume="13", pages="e68138", keywords="patient trajectories", keywords="disease prediction", keywords="representation learning", keywords="masked language model", keywords="deep learning", keywords="BERT", keywords="electronic health record", keywords="language model", keywords="transformer", keywords="heart failure", keywords="Alzheimer disease", keywords="prolonged length of stay", keywords="effectiveness", keywords="temporal", abstract="Background: The growing availability of electronic health records (EHRs) presents an opportunity to enhance patient care by uncovering hidden health risks and improving informed decisions through advanced deep learning methods. However, modeling EHR sequential data, that is, patient trajectories, is challenging due to the evolving relationships between diagnoses and treatments over time. Significant progress has been achieved using transformers and self-supervised learning. While BERT-inspired models using masked language modeling (MLM) capture EHR context, they often struggle with the complex temporal dynamics of disease progression and interventions. Objective: This study aims to improve the modeling of EHR sequences by addressing the limitations of traditional transformer-based approaches in capturing complex temporal dependencies. Methods: We introduce Trajectory Order Objective BERT (Bidirectional Encoder Representations from Transformers; TOO-BERT), a transformer-based model that advances the MLM pretraining approach by integrating a novel TOO to better learn the complex sequential dependencies between medical events. TOO-BERT enhances the context learned by MLM by pretraining the model to distinguish ordered sequences of medical codes from permuted ones in a patient trajectory. The TOO is enhanced by a conditional selection process that focuses on medical codes or visits that frequently occur together, to further improve contextual understanding and strengthen temporal awareness. We evaluate TOO-BERT on 2 extensive EHR datasets, MIMIC-IV hospitalization records and the Malmo Diet and Cancer Cohort (MDC)---comprising approximately 10 and 8 million medical codes, respectively. TOO-BERT is compared against conventional machine learning methods, a transformer trained from scratch, and a transformer pretrained on MLM in predicting heart failure (HF), Alzheimer disease (AD), and prolonged length of stay (PLS). Results: TOO-BERT outperformed conventional machine learning methods and transformer-based approaches in HF, AD, and PLS prediction across both datasets. In the MDC dataset, TOO-BERT improved HF and AD prediction, increasing area under the receiver operating characteristic curve (AUC) scores from 67.7 and 69.5 with the MLM-pretrained Transformer to 73.9 and 71.9, respectively. In the MIMIC-IV dataset, TOO-BERT enhanced HF and PLS prediction, raising AUC scores from 86.2 and 60.2 with the MLM-pretrained Transformer to 89.8 and 60.4, respectively. 
Notably, TOO-BERT demonstrated strong performance in HF prediction even with limited fine-tuning data, achieving AUC scores of 0.877 and 0.823, compared to 0.839 and 0.799 for the MLM-pretrained Transformer, when fine-tuned on only 50\% (442/884) and 20\% (176/884) of the training data, respectively. Conclusions: These findings demonstrate the effectiveness of integrating temporal ordering objectives into MLM-pretrained models, enabling deeper insights into the complex temporal relationships inherent in EHR data. Attention analysis further highlights TOO-BERT's capability to capture and represent sophisticated structural patterns within patient trajectories, offering a more nuanced understanding of disease progression. ", doi="10.2196/68138", url="https://medinform.jmir.org/2025/1/e68138", url="http://www.ncbi.nlm.nih.gov/pubmed/40465350" } @Article{info:doi/10.2196/66926, author="Cheng, You and Malekar, Mrunal and He, Yingnan and Bommareddy, Apoorva and Magdamo, Colin and Singh, Arjun and Westover, Brandon and Mukerji, S. Shibani and Dickson, John and Das, Sudeshna", title="High-Throughput Phenotyping of the Symptoms of Alzheimer Disease and Related Dementias Using Large Language Models: Cross-Sectional Study", journal="JMIR AI", year="2025", month="Jun", day="3", volume="4", pages="e66926", keywords="electronic health record", keywords="Alzheimer disease and related dementias", keywords="large language model", keywords="disease phenotyping", keywords="symptom extraction", keywords="differential diagnosis", keywords="brain volume", abstract="Background: Alzheimer disease and related dementias (ADRD) are complex disorders with overlapping symptoms and pathologies. Comprehensive records of symptoms in electronic health records (EHRs) are critical for not only reaching an accurate diagnosis but also supporting ongoing research studies and clinical trials. However, these symptoms are frequently obscured within unstructured clinical notes in EHRs, making manual extraction both time-consuming and labor-intensive. Objective: We aimed to automate symptom extraction from the clinical notes of patients with ADRD using fine-tuned large language models (LLMs), compare its performance to regular expression-based symptom recognition, and validate the results using brain magnetic resonance imaging (MRI) data. Methods: We fine-tuned LLMs to extract ADRD symptoms across the following 7 domains: memory, executive function, motor, language, visuospatial, neuropsychiatric, and sleep. We assessed the algorithm's performance by calculating the area under the receiver operating characteristic curve (AUROC) for each domain. The extracted symptoms were then validated in two analyses: (1) predicting ADRD diagnosis using the counts of extracted symptoms and (2) examining the association between ADRD symptoms and MRI-derived brain volumes. Results: Symptom extraction across the 7 domains achieved high accuracy with AUROCs ranging from 0.97 to 0.99. Using the counts of extracted symptoms to predict ADRD diagnosis yielded an AUROC of 0.83 (95\% CI 0.77-0.89). Symptom associations with brain volumes revealed that a smaller hippocampal volume was linked to memory impairments (odds ratio 0.62, 95\% CI 0.46-0.84; P=.006), and reduced pallidum size was associated with motor impairments (odds ratio 0.73, 95\% CI 0.58-0.90; P=.04). Conclusions: These results highlight the accuracy and reliability of our high-throughput ADRD phenotyping algorithm. 
By enabling automated symptom extraction, our approach has the potential to assist with differential diagnosis, as well as facilitate clinical trials and research studies of dementia. ", doi="10.2196/66926", url="https://ai.jmir.org/2025/1/e66926" } @Article{info:doi/10.2196/67369, author="Shewcraft, Allen Ryan and Schwarz, John and Micsinai Balan, Mariann", title="Algorithmic Classification of Psychiatric Disorder--Related Spontaneous Communication Using Large Language Model Embeddings: Algorithm Development and Validation", journal="JMIR AI", year="2025", month="May", day="30", volume="4", pages="e67369", keywords="psychiatric disorders", keywords="large language models", keywords="speech", keywords="language", keywords="spontaneous communication", keywords="social media", keywords="LLM", keywords="communication", keywords="algorithm", keywords="emotion", keywords="schizophrenia", keywords="borderline personality disorder", keywords="BPD", keywords="depression", keywords="attention-deficit/hyperactivity disorder", keywords="ADHD", keywords="anxiety", keywords="posttraumatic stress disorder", keywords="PTSD", keywords="bipolar disorder", keywords="assessment", keywords="monitoring", abstract="Background: Language, which is a crucial element of human communication, is influenced by the complex interplay between thoughts, emotions, and experiences. Psychiatric disorders have an impact on cognitive and emotional processes, which in turn affect the content and way individuals with these disorders communicate using language. The recent rapid advancements in large language models (LLMs) suggest that leveraging them for quantitative analysis of language usage has the potential to become a useful method for providing objective measures in diagnosing and monitoring psychiatric conditions by analyzing language patterns. Objective: This study aims to explore the use of LLMs in analyzing spontaneous communication to differentiate between various psychiatric disorders. We seek to show that the latent LLM embedding space identifies distinct linguistic markers that can be used to classify spontaneous communication from 7 different psychiatric disorders. Methods: We used embeddings from the 7 billion parameter Generative Representational Instruction Tuning Language Model to analyze more than 37,000 posts from subreddits dedicated to seven common conditions: schizophrenia, borderline personality disorder (BPD), depression, attention-deficit/hyperactivity disorder (ADHD), anxiety, posttraumatic stress disorder (PTSD) and bipolar disorder. A cross-validated multiclass Extreme Gradient Boosting classifier was trained on these embeddings to predict the origin subreddit for each post. Performance was evaluated using metrics such as precision, recall, F1-score, and area under the receiver operating characteristic curve (AUC). In addition, we used Uniform Manifold Approximation and Projection dimensionality reduction to visualize relationships in language between these psychiatric disorders. Results: The 10-fold cross-validated Extreme Gradient Boosting classifier achieved a support-weighted average precision, recall, F1, and accuracy score of 0.73, 0.73, 0.73, and 0.73, respectively. In one-versus-rest tasks, individual category AUCs ranged from 0.89 to 0.97, with a microaverage AUC of 0.95. ADHD posts were classified with the highest AUC of 0.97, indicating distinct linguistic features, while BPD posts had the lowest AUC of 0.89, suggesting greater linguistic overlap with other conditions. 
Consistent with the classifier results, the ADHD posts form a more visually distinct cluster in the Uniform Manifold Approximation and Projection projection, while BPD overlaps with depression, anxiety, and schizophrenia. Comparisons with other state-of-the-art embedding methods, such as OpenAI's text-embedding-3-small (AUC=0.94) and sentence-bidirectional encoder representations from transformers (AUC=0.86), demonstrated superior performance of the Generative Representational Instruction Tuning Language Model-7B model. Conclusions: This study introduces an innovative use of LLMs in psychiatry, showcasing their potential to objectively examine language use for distinguishing between different psychiatric disorders. The findings highlight the capability of LLMs to offer valuable insights into the linguistic patterns unique to various conditions, paving the way for more efficient, patient-focused diagnostic and monitoring strategies. Future research should aim to validate these results with clinically confirmed populations and investigate the implications of comorbidity and spectrum disorders. ", doi="10.2196/67369", url="https://ai.jmir.org/2025/1/e67369" } @Article{info:doi/10.2196/69534, author="Parks, Acacia and Travers, Eoin and Perera-Delcourt, Ramesh and Major, Max and Economides, Marcos and Mullan, Phil", title="Is This Chatbot Safe and Evidence-Based? A Call for the Critical Evaluation of Generative AI Mental Health Chatbots", journal="J Particip Med", year="2025", month="May", day="29", volume="17", pages="e69534", keywords="GenAI", keywords="mental health", keywords="chatbot", keywords="ethics", keywords="evals", doi="10.2196/69534", url="https://jopm.jmir.org/2025/1/e69534" } @Article{info:doi/10.2196/70339, author="Hose, Bat-Zion and Rounds, K. Amanda and Nandwani, Ishaan and Busog, Deanna-Nicole and Giardina, Davis Traber and Haskell, Helen and Smith, M. Kelly and Miller, E. Kristen", title="Use of ChatGPT for Urinary Symptom Management Among People With Spinal Cord Injury or Disease: Qualitative Study", journal="JMIR Rehabil Assist Technol", year="2025", month="May", day="29", volume="12", pages="e70339", keywords="ChatGPT", keywords="urinary symptom management", keywords="spinal cord injury", keywords="trust in artificial intelligence", abstract="Background: Individuals with spinal cord injury or disease (SCI/D) experience disproportionately high rates of recurrent urinary tract infections, which are often complicated by atypical symptoms and delayed diagnoses. Patient-centered tools, like the Urinary Symptom Questionnaires for Neurogenic Bladder (USQNB), have been developed to support symptom assessment yet remain underused. Generative artificial intelligence tools such as ChatGPT may offer a more usable approach to improving symptom management by providing real-time, tailored health information directly to patients. Objective: This study explores the role of ChatGPT (version 3.5) in supporting urinary symptom management for individuals with SCI/D, focusing on its perceived accuracy, usefulness, and impact on health care engagement and self-management practices. Methods: A total of 30 individuals with SCI/D were recruited through advocacy groups and health care networks. Using realistic, scenario-based testing derived from validated tools for symptom management with SCI/D, such as the USQNB, participants interacted with ChatGPT to seek advice for urinary symptoms. 
Follow-up interviews were conducted remotely to assess individuals' experiences using ChatGPT for urinary symptom management. Data were analyzed using inductive content analysis, with themes refined iteratively through a consensus-based process. Results: People with SCI/D reported high levels of trust in ChatGPT's recommendations, with all 30 participants agreeing or strongly agreeing with the advice provided. ChatGPT's responses were perceived as clear and comparable to professional medical advice. Participants mentioned concerns about the lack of sources and integration with patient-specific data. ChatGPT influenced individuals' decision-making by supporting symptom assessment and guiding participants on when to seek professional care or pursue self-management strategies. Conclusions: ChatGPT is a promising tool for symptom assessment and managing chronic conditions such as urinary symptoms in individuals with SCI/D. While ChatGPT enhances accessibility to health information, further research is needed to improve its transparency and integration with personalized health data to be a more usable tool in making informed health decisions. ", doi="10.2196/70339", url="https://rehab.jmir.org/2025/1/e70339" } @Article{info:doi/10.2196/66429, author="Liu, Hui and Peng, Jialun and Li, Lu and Deng, Ao and Huang, XiangXin and Yin, Guobing and Luo, Haojun", title="Large Language Models as a Consulting Hotline for Patients With Breast Cancer and Specialists in China: Cross-Sectional Questionnaire Study", journal="JMIR Med Inform", year="2025", month="May", day="27", volume="13", pages="e66429", keywords="large language models", keywords="breast cancer", keywords="health education", keywords="cross-sectional study", abstract="Background: The disease burden of breast cancer is increasing in China. Guiding people to obtain accurate information on breast cancer and improving the public's health literacy are crucial for the early detection and timely treatment of breast cancer. Large language model (LLM) is a currently popular source of health information. However, the accuracy and practicality of the breast cancer--related information provided by LLMs have not yet been evaluated. Objective: This study aims to evaluate and compare the accuracy, practicality, and generalization-specificity of responses to breast cancer--related questions from two LLMs, ChatGPT and ERNIE Bot (EB). Methods: The questions asked to the LLMs consisted of a patient questionnaire and an expert questionnaire, each containing 15 questions. ChatGPT was queried in both Chinese and English, recorded as ChatGPT-Chinese (ChatGPT-C) and ChatGPT-English (ChatGPT-E) respectively, while EB was queried in Chinese. The accuracy, practicality, and generalization-specificity of each inquiry's responses were rated by a breast cancer multidisciplinary treatment team using Likert scales. Results: Overall, for both the patient and expert questionnaire, the accuracy and practicality of responses from ChatGPT-E were significantly higher than those from ChatGPT-C and EB (all Ps<.001). However, the responses from all LLMs are relatively generalized, leading to lower accuracy and practicality for the expert questionnaire compared to the patient questionnaire. Additionally, there were issues such as the lack of supporting evidence and potential ethical risks in the responses of LLMs. 
Conclusions: Currently, compared to other LLMs, ChatGPT-E has demonstrated greater potential for application in educating Chinese patients with breast cancer, and may serve as an effective tool for them to obtain health information. However, for breast cancer specialists, these LLMs are not yet suitable for assisting in clinical diagnosis or treatment activities. Additionally, data security, ethical, and legal risks associated with using LLMs in clinical practice cannot be ignored. In the future, further research is needed to determine the true efficacy of LLMs in clinical scenarios related to breast cancer in China. ", doi="10.2196/66429", url="https://medinform.jmir.org/2025/1/e66429" } @Article{info:doi/10.2196/66917, author="Omar, Mahmud and Agbareia, Reem and Glicksberg, S. Benjamin and Nadkarni, N. Girish and Klang, Eyal", title="Benchmarking the Confidence of Large Language Models in Answering Clinical Questions: Cross-Sectional Evaluation Study", journal="JMIR Med Inform", year="2025", month="May", day="16", volume="13", pages="e66917", keywords="safe AI", keywords="artificial intelligence", keywords="AI", keywords="algorithm", keywords="large language model", keywords="LLM", keywords="natural language processing", keywords="NLP", keywords="deep learning", abstract="Background: The capabilities of large language models (LLMs) to self-assess their own confidence in answering questions within the biomedical realm remain underexplored. Objective: This study evaluates the confidence levels of 12 LLMs across 5 medical specialties to assess LLMs' ability to accurately judge their own responses. Methods: We used 1965 multiple-choice questions that assessed clinical knowledge in the following areas: internal medicine, obstetrics and gynecology, psychiatry, pediatrics, and general surgery. Models were prompted to provide answers and to also provide their confidence for the correct answers (score: range 0\%-100\%). We calculated the correlation between each model's mean confidence score for correct answers and the overall accuracy of each model across all questions. The confidence scores for correct and incorrect answers were also analyzed to determine the mean difference in confidence, using 2-sample, 2-tailed t tests. Results: The correlation between the mean confidence scores for correct answers and model accuracy was inverse and statistically significant (r=-0.40; P=.001), indicating that worse-performing models exhibited paradoxically higher confidence. For instance, a top-performing model---GPT-4o---had a mean accuracy of 74\% (SD 9.4\%), with a mean confidence of 63\% (SD 8.3\%), whereas a low-performing model---Qwen2-7B---showed a mean accuracy of 46\% (SD 10.5\%) but a mean confidence of 76\% (SD 11.7\%). The mean difference in confidence between correct and incorrect responses was low for all models, ranging from 0.6\% to 5.4\%, with GPT-4o having the highest mean difference (5.4\%, SD 2.3\%; P=.003). Conclusions: Better-performing LLMs show more aligned overall confidence levels. However, even the most accurate models still show minimal variation in confidence between right and wrong answers. This may limit their safe use in clinical settings. Addressing overconfidence could involve refining calibration methods, performing domain-specific fine-tuning, and involving human oversight when decisions carry high risks. Further research is needed to improve these strategies before broader clinical adoption of LLMs. 
", doi="10.2196/66917", url="https://medinform.jmir.org/2025/1/e66917" } @Article{info:doi/10.2196/71613, author="Wang, Chenxu and Wang, Fei and Li, Shuhan and Ren, Qing-wen and Tan, Xiaomei and Fu, Yaoyu and Liu, Di and Qian, Guangwu and Cao, Yu and Yin, Rong and Li, Kang", title="Patient Triage and Guidance in Emergency Departments Using Large Language Models: Multimetric Study", journal="J Med Internet Res", year="2025", month="May", day="15", volume="27", pages="e71613", keywords="ChatGPT", keywords="artificial intelligence", keywords="patient triage", keywords="health care", keywords="prompt engineering", keywords="large language models", keywords="Modified Early Warning Score", abstract="Background: Emergency departments (EDs) face significant challenges due to overcrowding, prolonged waiting times, and staff shortages, leading to increased strain on health care systems. Efficient triage systems and accurate departmental guidance are critical for alleviating these pressures. Recent advancements in large language models (LLMs), such as ChatGPT, offer potential solutions for improving patient triage and outpatient department selection in emergency settings. Objective: The study aimed to assess the accuracy, consistency, and feasibility of GPT-4--based ChatGPT models (GPT-4o and GPT-4-Turbo) for patient triage using the Modified Early Warning Score (MEWS) and evaluate GPT-4o's ability to provide accurate outpatient department guidance based on simulated patient scenarios. Methods: A 2-phase experimental study was conducted. In the first phase, 2 ChatGPT models (GPT-4o and GPT-4-Turbo) were evaluated for MEWS-based patient triage accuracy using 1854 simulated patient scenarios. Accuracy and consistency were assessed before and after prompt engineering. In the second phase, GPT-4o was tested for outpatient department selection accuracy using 264 scenarios sourced from the Chinese Medical Case Repository. Each scenario was independently evaluated by GPT-4o thrice. Data analyses included Wilcoxon tests, Kendall correlation coefficients, and logistic regression analyses. Results: In the first phase, ChatGPT's triage accuracy, based on MEWS, improved following prompt engineering. Interestingly, GPT-4-Turbo outperformed GPT-4o. GPT-4-Turbo achieved an accuracy of 100\% compared to GPT-4o's accuracy of 96.2\%, despite GPT-4o initially showing better performance prior to prompt engineering. This finding suggests that GPT-4-Turbo may be more adaptable to prompt optimization. In the second phase, GPT-4o, with superior performance on emotional responsiveness compared to GPT-4-Turbo, demonstrated an overall guidance accuracy of 92.63\% (95\% CI 90.34\%-94.93\%), with the highest accuracy in internal medicine (93.51\%, 95\% CI 90.85\%-96.17\%) and the lowest in general surgery (91.46\%, 95\% CI 86.50\%-96.43\%). Conclusions: ChatGPT demonstrated promising capability for supporting patient triage and outpatient guidance in EDs. GPT-4-Turbo showed greater adaptability to prompt engineering, whereas GPT-4o exhibited superior responsiveness and emotional interaction, which are essential for patient-facing tasks. Future studies should explore real-world implementation and address the identified limitations to enhance ChatGPT's clinical integration. ", doi="10.2196/71613", url="https://www.jmir.org/2025/1/e71613" } @Article{info:doi/10.2196/69004, author="Adams, CB Meredith and Perkins, L. Matthew and Hudson, Cody and Madhira, Vithal and Akbilgic, Oguz and Ma, Da and Hurley, W. 
Robert and Topaloglu, Umit", title="Breaking Digital Health Barriers Through a Large Language Model--Based Tool for Automated Observational Medical Outcomes Partnership Mapping: Development and Validation Study", journal="J Med Internet Res", year="2025", month="May", day="15", volume="27", pages="e69004", keywords="large language model", keywords="artificial intelligence", keywords="common data model", keywords="data harmonization", keywords="clinical trials", keywords="electronic health record", keywords="registry data", abstract="Background: The integration of diverse clinical data sources requires standardization through models such as Observational Medical Outcomes Partnership (OMOP). However, mapping data elements to OMOP concepts demands significant technical expertise and time. While large health care systems often have resources for OMOP conversion, smaller clinical trials and studies frequently lack such support, leaving valuable research data siloed. Objective: This study aims to develop and validate a user-friendly tool that leverages large language models to automate the OMOP conversion process for clinical trials, electronic health records, and registry data. Methods: We developed a 3-tiered semantic matching system using GPT-3 embeddings to transform heterogeneous clinical data to the OMOP Common Data Model. The system processes input terms by generating vector embeddings, computing cosine similarity against precomputed Observational Health Data Sciences and Informatics vocabulary embeddings, and ranking potential matches. We validated the system using two independent datasets: (1) a development set of 76 National Institutes of Health Helping to End Addiction Long-term Initiative clinical trial common data elements for chronic pain and opioid use disorders and (2) a separate validation set of electronic health record concepts from the National Institutes of Health National COVID Cohort Collaborative COVID-19 enclave. The architecture combines Unified Medical Language System semantic frameworks with asynchronous processing for efficient concept mapping, made available through an open-source implementation. Results: The system achieved an area under the receiver operating characteristic curve of 0.9975 for mapping clinical trial common data element terms. Precision ranged from 0.92 to 0.99 and recall ranged from 0.88 to 0.97 across similarity thresholds from 0.85 to 1.0. In practical application, the tool successfully automated mappings that previously required manual informatics expertise, reducing the technical barriers for research teams to participate in large-scale, data-sharing initiatives. Representative mappings demonstrated high accuracy, such as demographic terms achieving 100\% similarity with corresponding Logical Observation Identifiers Names and Codes concepts. The implementation successfully processes diverse data types through both individual term mapping and batch processing capabilities. Conclusions: Our validated large language model--based tool effectively automates the transformation of clinical data into the OMOP format while maintaining high accuracy. The combination of semantic matching capabilities and a researcher-friendly interface makes data harmonization accessible to smaller research teams without requiring extensive informatics support. 
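The 3-tiered semantic matching described in the preceding abstract reduces, at its core, to embedding a source data element, computing cosine similarity against precomputed vocabulary embeddings, and ranking candidates above a similarity threshold. The sketch below illustrates only that core step; the concept names, vector dimensionality, and the random vectors standing in for real embeddings are assumptions for illustration and not the published tool.

# Illustrative sketch of the embedding-and-cosine-similarity mapping idea
# described above (not the published tool): rank candidate OMOP concepts
# for a source term against precomputed vocabulary embeddings.
import numpy as np

rng = np.random.default_rng(0)

# Stand-in for precomputed vocabulary embeddings (concept name -> vector);
# a real pipeline would obtain these from an embedding model.
vocab = {
    "Body weight": rng.normal(size=256),
    "Heart rate": rng.normal(size=256),
    "Systolic blood pressure": rng.normal(size=256),
}

def cosine(a: np.ndarray, b: np.ndarray) -> float:
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))

def rank_candidates(term_vector: np.ndarray, threshold: float = 0.85):
    """Keep candidates above the similarity threshold, else return the single best match."""
    scored = [(name, cosine(term_vector, vec)) for name, vec in vocab.items()]
    scored.sort(key=lambda x: x[1], reverse=True)
    return [(name, s) for name, s in scored if s >= threshold] or scored[:1]

# Stand-in query embedding for an input data element such as "weight (kg)"
query = rng.normal(size=256)
print(rank_candidates(query))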
This has direct implications for accelerating clinical research data standardization and enabling broader participation in initiatives such as the National Institutes of Health Helping to End Addiction Long-term Initiative Data Ecosystem. ", doi="10.2196/69004", url="https://www.jmir.org/2025/1/e69004", url="http://www.ncbi.nlm.nih.gov/pubmed/40146872" } @Article{info:doi/10.2196/70096, author="Zhang, Junyan and Zhou, Junchen and Zhou, Liqin and Ba, Zhichao", title="Extracting Multifaceted Characteristics of Patients With Chronic Disease Comorbidity: Framework Development Using Large Language Models", journal="JMIR Med Inform", year="2025", month="May", day="15", volume="13", pages="e70096", keywords="large language model", keywords="zero-shot prompting", keywords="information extraction", keywords="chronic disease", keywords="multimorbidity", keywords="natural language processing", abstract="Background: Research on chronic multimorbidity has increasingly become a focal point with the aging of the population. Many studies in this area require detailed patient characteristic information. However, the current methods for extracting such information are complex, time-consuming, and prone to errors. The challenge of quickly and accurately extracting patient characteristics has become a common issue in the study of chronic disease comorbidities. Objective: Our objective was to establish a comprehensive framework for extracting demographic and disease characteristics of patients with multimorbidity. This framework leverages large language models (LLMs) to extract feature information from unstructured and semistructured electronic health records pertaining to these patients. We investigated the model's proficiency in extracting feature information across 7 dimensions: basic information, disease details, lifestyle habits, family medical history, symptom history, medication recommendations, and dietary advice. In addition, we demonstrated the strengths and limitations of this framework. Methods: We used data sourced from a grassroots community health service center in China. We developed a multifaceted feature extraction framework tailored for patients with multimorbidity, which consists of several integral components: feasibility testing, preprocessing, the determination of feature extraction, prompt modeling based on LLMs, postprocessing, and midterm evaluation. Within this framework, 7 types of feature information were extracted as straightforward features, and 3 types of features were identified as intricate features. On the basis of the straightforward features, we calculated patients' age, BMI, and 12 disease risk factors. Rigorous manual verification experiments were conducted 100 times for straightforward features and 200 times for intricate features, followed by comprehensive quantitative and qualitative assessments of the experimental outcomes. Results: The framework achieved an overall F1-score of 99.6\% for the 7 straightforward feature extractions, with the highest F1-score of 100\% for basic information. In addition, the framework demonstrated an overall F1-score of 94.4\% for the 3 intricate feature extractions. Our analysis of the results revealed that accurate information content extraction is a substantial advantage of this framework, whereas ensuring consistency in the format of extracted information remains one of its challenges. 
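The prompt-based extraction step in the framework described above can be pictured as building a zero-shot instruction around each record and post-processing the model's reply into a fixed schema. The sketch below is a minimal illustration of that idea; the feature list, prompt wording, JSON schema, and mocked model reply are invented, and no actual LLM call is made.

# Illustrative sketch of zero-shot, prompt-based feature extraction from a
# clinical note, in the spirit of the framework described above. The prompt
# wording, feature list, and schema are invented for illustration.
import json

FEATURES = ["age", "sex", "diagnoses", "smoking_status", "medications"]

def build_prompt(note_text: str) -> str:
    return (
        "Extract the following patient features from the clinical note below. "
        f"Return a JSON object with exactly these keys: {', '.join(FEATURES)}. "
        "Use null when a feature is not mentioned.\n\n"
        f"Clinical note:\n{note_text}"
    )

def parse_response(raw: str) -> dict:
    """Post-process the model output into a dict, tolerating missing keys."""
    data = json.loads(raw)
    return {k: data.get(k) for k in FEATURES}

# Example with a mocked model response (no API call is made here)
note = "72-year-old woman with type 2 diabetes and hypertension; former smoker."
prompt = build_prompt(note)
mock_reply = (
    '{"age": 72, "sex": "female", '
    '"diagnoses": ["type 2 diabetes", "hypertension"], '
    '"smoking_status": "former smoker", "medications": null}'
)
print(parse_response(mock_reply))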
Conclusions: The framework incorporates electronic health record information from 1225 patients with multimorbidity, covering a diverse range of 41 chronic diseases, and can seamlessly accommodate the inclusion of additional diseases. This underscores its scalability and adaptability as a method for extracting patient-specific characteristics, effectively addressing the challenges associated with information extraction in the context of multidisease research. Research and medical policy personnel can extract feature information by setting corresponding goals based on the research objectives and directly using the LLM for zero-shot target feature extraction. This approach greatly improves research efficiency and reduces labor requirements; moreover, due to the framework's high accuracy, it can increase study reliability. ", doi="10.2196/70096", url="https://medinform.jmir.org/2025/1/e70096" } @Article{info:doi/10.2196/66161, author="Li, Yilan and Gu, Tianshu and Yang, Chengyuan and Li, Minghui and Wang, Congyi and Yao, Lan and Gu, Weikuan and Sun, DianJun", title="AI-Assisted Hypothesis Generation to Address Challenges in Cardiotoxicity Research: Simulation Study Using ChatGPT With GPT-4o", journal="J Med Internet Res", year="2025", month="May", day="15", volume="27", pages="e66161", keywords="cardiotoxicity", keywords="ChatGPT with GPT-4o", keywords="artificial intelligence", keywords="AI", keywords="heart", keywords="hypothesis generation", abstract="Background: Cardiotoxicity is a major concern in heart disease research because it can lead to severe cardiac damage, including heart failure and arrhythmias. Objective: This study aimed to explore the ability of ChatGPT with GPT-4o to generate innovative research hypotheses to address 5 major challenges in cardiotoxicity research: the complexity of mechanisms, variability among patients, the lack of detection sensitivity, the lack of reliable biomarkers, and the limitations of animal models. Methods: ChatGPT with GPT-4o was used to generate multiple hypotheses for each of the 5 challenges. These hypotheses were then independently evaluated by 3 experts for novelty and feasibility. ChatGPT with GPT-4o subsequently selected the most promising hypothesis from each category and provided detailed experimental plans, including background, rationale, experimental design, expected outcomes, potential pitfalls, and alternative approaches. Results: ChatGPT with GPT-4o generated 96 hypotheses, of which 13 (14\%) were rated as highly novel and 62 (65\%) as moderately novel. The average group score of 3.85 indicated a strong level of innovation in these hypotheses. Literature searching identified at least 1 relevant publication for 28 (29\%) of the 96 hypotheses. The selected hypotheses included using single-cell RNA sequencing to understand cellular heterogeneity, integrating artificial intelligence with genetic profiles for personalized cardiotoxicity risk prediction, applying machine learning to electrocardiogram data for enhanced detection sensitivity, using multi-omics approaches for biomarker discovery, and developing 3D bioprinted heart tissues to overcome the limitations of animal models. Our group's evaluation of the 30 dimensions of the experimental plans for the 5 hypotheses selected by ChatGPT with GPT-4o revealed consistent strengths in the background, rationale, and alternative approaches, with most of the hypotheses (20/30, 67\%) receiving scores of $\geq$4 in these areas. 
While the hypotheses were generally well received, the experimental designs were often deemed overly ambitious, highlighting the need for more practical considerations. Conclusions: Our study demonstrates that ChatGPT with GPT-4o can generate innovative and potentially impactful hypotheses for overcoming critical challenges in cardiotoxicity research. These findings suggest that artificial intelligence--assisted hypothesis generation could play a crucial role in advancing the field of cardiotoxicity, leading to more accurate predictions, earlier detection, and better patient outcomes. ", doi="10.2196/66161", url="https://www.jmir.org/2025/1/e66161" } @Article{info:doi/10.2196/69639, author="Feng, Yi and Hang, Yaming and Wu, Wenzhi and Song, Xiaohang and Xiao, Xiyao and Dong, Fangbai and Qiao, Zhihong", title="Effectiveness of AI-Driven Conversational Agents in Improving Mental Health Among Young People: Systematic Review and Meta-Analysis", journal="J Med Internet Res", year="2025", month="May", day="14", volume="27", pages="e69639", keywords="artificial intelligence", keywords="conversational agents", keywords="meta-analysis", keywords="mental health intervention", keywords="young people", abstract="Background: The increasing prevalence of mental health issues among adolescents and young adults, coupled with barriers to accessing traditional therapy, has led to growing interest in artificial intelligence (AI)-driven conversational agents (CAs) as a novel digital mental health intervention. Despite accumulating evidence suggesting the effectiveness of AI-driven CAs for mental health, there is still limited evidence on their effectiveness for different mental health conditions in adolescents and young adults. Objective: This study aims to examine the effectiveness of AI-driven CAs for mental health among young people, and explore the potential moderators of efficacy. Methods: A total of 5 main databases (PubMed, PsycINFO, Embase, Cochrane Library, and Web of Science) were searched systematically dated from the establishment of the database to August 6, 2024. Randomized controlled trials comparing AI-driven CAs with any other type of control condition in improving depressive symptoms, generalized anxiety symptoms, stress, mental well-being, and positive and negative affect were considered eligible when they were conducted in young people aged 12-25 years. The quality of these studies was assessed using the Cochrane Risk of Bias tool. Data were extracted by 2 independent reviewers and checked by a third reviewer. Pooled effect sizes (Hedges g) were calculated using random effect models and visually presented in forest plots. Results: A total of 14 articles (including 15 trials) were included, involving 1974 participants. The results indicated that, after adjustment for publication bias, AI-driven CAs had a moderate-to-large (Hedges g=0.61, 95\% CI 0.35-0.86) effect on depressive symptoms compared to control conditions. However, their effect sizes adjusting for publication bias for generalized anxiety symptoms (Hedges g=0.06, 95\% CI --0.21 to 0.32), stress (Hedges g=0.002, 95\% CI --0.19 to 0.20), positive affect (Hedges g=0.01, 95\% CI --0.24 to 0.27), negative affect (Hedges g=0.07, 95\% CI --0.13 to 0.27), and mental well-being (Hedges g=0.04, 95\% CI --0.21 to 0.29) were all nonsignificant. Subgroup analyses revealed that AI-driven CAs were particularly effective in improving depressive symptoms among subclinical populations (Hedges g=0.74, 95\% CI 0.50-0.98). 
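For readers unfamiliar with the pooling method used in the meta-analysis above, the following sketch shows how a Hedges g effect size and a DerSimonian-Laird random-effects pooled estimate are typically computed; the trial summaries are placeholder numbers, not data from the review, and the authors' exact software and settings are not assumed here.

# Illustrative sketch of the effect-size computation behind a random-effects
# meta-analysis: Hedges g per trial and a DerSimonian-Laird pooled estimate.
import numpy as np

def hedges_g(m1, sd1, n1, m2, sd2, n2):
    """Small-sample corrected standardized mean difference and its variance."""
    sp = np.sqrt(((n1 - 1) * sd1**2 + (n2 - 1) * sd2**2) / (n1 + n2 - 2))
    d = (m1 - m2) / sp
    j = 1 - 3 / (4 * (n1 + n2) - 9)          # Hedges small-sample correction
    g = j * d
    var = (n1 + n2) / (n1 * n2) + g**2 / (2 * (n1 + n2))
    return g, var

def pool_random_effects(gs, vs):
    """DerSimonian-Laird tau^2 and inverse-variance pooled effect with 95% CI."""
    gs, vs = np.asarray(gs, float), np.asarray(vs, float)
    w = 1 / vs
    fixed = np.sum(w * gs) / np.sum(w)
    q = np.sum(w * (gs - fixed) ** 2)
    c = np.sum(w) - np.sum(w**2) / np.sum(w)
    tau2 = max(0.0, (q - (len(gs) - 1)) / c)
    w_re = 1 / (vs + tau2)
    pooled = np.sum(w_re * gs) / np.sum(w_re)
    se = np.sqrt(1 / np.sum(w_re))
    return pooled, (pooled - 1.96 * se, pooled + 1.96 * se)

# Placeholder trial data: (mean, SD, n) for intervention and control arms
trials = [((10.2, 4.0, 50), (12.5, 4.2, 48)), ((9.8, 3.5, 60), (11.0, 3.9, 62))]
effects = [hedges_g(*a, *b) for a, b in trials]
print(pool_random_effects([e[0] for e in effects], [e[1] for e in effects]))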
Conclusions: The findings highlight the potential of AI-driven CAs for early intervention in depression among this population, and underscore the need for further improvements to enhance their efficacy across a broader range of mental health outcomes. Key limitations of the reviewed evidence include heterogeneity in therapeutic orientations of CAs and lack of follow-up measures. Future research should explore the long-term effects of AI-driven CAs on mental health outcomes. ", doi="10.2196/69639", url="https://www.jmir.org/2025/1/e69639" } @Article{info:doi/10.2196/70122, author="Yue, Yongjie and Liu, Dong and Lv, Yilin and Hao, Junyi and Cui, Peixuan", title="A Practical Guide and Assessment on Using ChatGPT to Conduct Grounded Theory: Tutorial", journal="J Med Internet Res", year="2025", month="May", day="14", volume="27", pages="e70122", keywords="grounded theory", keywords="ChatGPT", keywords="manual coding", keywords="computer-assisted software", keywords="performance", keywords="human-AI collaboration", doi="10.2196/70122", url="https://www.jmir.org/2025/1/e70122" } @Article{info:doi/10.2196/66796, author="Suppan, M{\'e}lanie and Fubini, Elias Pietro and Stefani, Alexandra and Gisselbaek, Mia and Samer, Flora Caroline and Savoldelli, Louis Georges", title="Performance of 3 Conversational Generative Artificial Intelligence Models for Computing Maximum Safe Doses of Local Anesthetics: Comparative Analysis", journal="JMIR AI", year="2025", month="May", day="13", volume="4", pages="e66796", keywords="local anesthetic", keywords="dose calculation", keywords="toxicity", keywords="performance", keywords="conversational generative artificial intelligence", keywords="artificial intelligence", keywords="anesthesiology", keywords="comparative analysis", keywords="anesthetics", keywords="LA", keywords="generative artificial intelligence", keywords="ChatGPT", keywords="Copilot", keywords="Gemini", keywords="artificial intelligence models", keywords="machine learning", keywords="neural network", keywords="LLM", keywords="NLP", keywords="natural language processing", keywords="large language model", keywords="AI", keywords="ML", abstract="Background: Generative artificial intelligence (AI) is showing great promise as a tool to optimize decision-making across various fields, including medicine. In anesthesiology, accurately calculating maximum safe doses of local anesthetics (LAs) is crucial to prevent complications such as local anesthetic systemic toxicity (LAST). Current methods for determining LA dosage are largely based on empirical guidelines and clinician experience, which can result in significant variability and dosing errors. AI models may offer a solution, by processing multiple parameters simultaneously to suggest adequate LA doses. Objective: This study aimed to evaluate the efficacy and safety of 3 generative AI models, ChatGPT (OpenAI), Copilot (Microsoft Corporation), and Gemini (Google LLC), in calculating maximum safe LA doses, with the goal of determining their potential use in clinical practice. Methods: A comparative analysis was conducted using a 51-item questionnaire designed to assess LA dose calculation across 10 simulated clinical vignettes. The responses generated by ChatGPT, Copilot, and Gemini were compared with reference doses calculated using a scientifically validated set of rules. 
Quantitative evaluations involved comparing AI-generated doses to these reference doses, while qualitative assessments were conducted by independent reviewers using a 5-point Likert scale. Results: All 3 AI models (Gemini, ChatGPT, and Copilot) completed the questionnaire and generated responses aligned with LA dose calculation principles, but their performance in providing safe doses varied significantly. Gemini frequently avoided proposing any specific dose, instead recommending consultation with a specialist. When it did provide dose ranges, they often exceeded safe limits by 140\% (SD 103\%) in cases involving mixtures. ChatGPT provided unsafe doses in 90\% (9/10) of cases, exceeding safe limits by 198\% (SD 196\%). Copilot's recommendations were unsafe in 67\% (6/9) of cases, exceeding limits by 217\% (SD 239\%). Qualitative assessments rated Gemini as ``fair'' and both ChatGPT and Copilot as ``poor.'' Conclusions: Generative AI models like Gemini, ChatGPT, and Copilot currently lack the accuracy and reliability needed for safe LA dose calculation. Their poor performance suggests that they should not be used as decision-making tools for this purpose. Until more reliable AI-driven solutions are developed and validated, clinicians should rely on their expertise, experience, and a careful assessment of individual patient factors to guide LA dosing and ensure patient safety. ", doi="10.2196/66796", url="https://ai.jmir.org/2025/1/e66796" } @Article{info:doi/10.2196/66552, author="Rodrigues Alessi, Mateus and Gomes, Augusto Heitor and Oliveira, Gabriel and Lopes de Castro, Matheus and Grenteski, Fabiano and Miyashiro, Leticia and do Valle, Camila and Tozzini Tavares da Silva, Leticia and Okamoto, Cristina", title="Comparative Performance of Medical Students, ChatGPT-3.5 and ChatGPT-4.0 in Answering Questions From a Brazilian National Medical Exam: Cross-Sectional Questionnaire Study", journal="JMIR AI", year="2025", month="May", day="8", volume="4", pages="e66552", keywords="artificial intelligence", keywords="intelligent systems", keywords="biomedical technology", keywords="medical ethics", keywords="exam questions", keywords="academic performance", keywords="AI", keywords="ethics", keywords="medical education", keywords="ChatGPT", keywords="medical exam", keywords="accuracy", keywords="medical student", keywords="observational study", keywords="medical data", keywords="medical school", abstract="Background: Artificial intelligence has advanced significantly in various fields, including medicine, where tools like ChatGPT (GPT) have demonstrated remarkable capabilities in interpreting and synthesizing complex medical data. Since its launch in 2019, GPT has evolved, with version 4.0 offering enhanced processing power, image interpretation, and more accurate responses. In medicine, GPT has been used for diagnosis, research, and education, achieving significant milestones like passing the United States Medical Licensing Examination. Recent studies show that GPT 4.0 outperforms earlier versions and even medical students on medical exams. Objective: This study aimed to evaluate and compare the performance of GPT versions 3.5 and 4.0 on Brazilian Progress Tests (PT) from 2021 to 2023, analyzing their accuracy compared to medical students. Methods: A cross-sectional observational study was conducted using 333 multiple-choice questions from the PT, excluding questions with images and those nullified or repeated. 
All questions were presented sequentially without modification to their structure. The performance of GPT versions was compared using statistical methods and medical students' scores were included for context. Results: There was a statistically significant difference in total performance scores across the 2021, 2022, and 2023 exams between GPT-3.5 and GPT-4.0 (P=.03). However, this significance did not remain after Bonferroni correction. On average, GPT v3.5 scored 68.4\%, whereas v4.0 achieved 87.2\%, reflecting an absolute improvement of 18.8\% and a relative increase of 27.4\% in accuracy. When broken down by subject, the average scores for GPT-3.5 and GPT-4.0, respectively, were as follows: surgery (73.5\% vs 88.0\%, P=.03), basic sciences (77.5\% vs 96.2\%, P=.004), internal medicine (61.5\% vs 75.1\%, P=.14), gynecology and obstetrics (64.5\% vs 94.8\%, P=.002), pediatrics (58.5\% vs 80.0\%, P=.02), and public health (77.8\% vs 89.6\%, P=.02). After Bonferroni correction, only basic sciences and gynecology and obstetrics retained statistically significant differences. Conclusions: GPT-4.0 demonstrates superior accuracy compared to its predecessor in answering medical questions on the PT. These results are similar to other studies, indicating that we are approaching a new revolution in medicine. ", doi="10.2196/66552", url="https://ai.jmir.org/2025/1/e66552" } @Article{info:doi/10.2196/67383, author="Sumner, Jennifer and Wang, Yuchen and Tan, Ying Si and Chew, Hoon Emily Hwee and Wenjun Yip, Alexander", title="Perspectives and Experiences With Large Language Models in Health Care: Survey Study", journal="J Med Internet Res", year="2025", month="May", day="1", volume="27", pages="e67383", keywords="digital health", keywords="artificial intelligence", keywords="survey research", keywords="large language model", keywords="healthcare", keywords="survey", keywords="workforce", keywords="healthcare worker", keywords="professional", abstract="Background: Large language models (LLMs) are transforming how data is used, including within the health care sector. However, frameworks including the Unified Theory of Acceptance and Use of Technology highlight the importance of understanding the factors that influence technology use for successful implementation. Objective: This study aimed to (1) investigate users' uptake, perceptions, and experiences regarding LLMs in health care and (2) contextualize survey responses by demographics and professional profiles. Methods: An electronic survey was administered to elicit stakeholder perspectives of LLMs (health care providers and support functions), their experiences with LLMs, and their potential impact on functional roles. Survey domains included: demographics (6 questions), user experiences of LLMs (8 questions), motivations for using LLMs (6 questions), and perceived impact on functional roles (4 questions). The survey was launched electronically, targeting health care providers or support staff, health care students, and academics in health-related fields. Respondents were adults (>18 years) aware of LLMs. Results: Responses were received from 1083 individuals, of which 845 were analyzable. Of the 845 respondents, 221 had yet to use an LLM. Nonusers were more likely to be health care workers (P<.001), older (P<.001), and female (P<.01). Users primarily adopted LLMs for speed, convenience, and productivity. While 75\% (470/624) agreed that the user experience was positive, 46\% (294/624) found the generated content unhelpful. 
Regression analysis showed that the experience with LLMs is more likely to be positive if the user is male (odds ratio [OR] 1.62, CI 1.06-2.48), and increasing age was associated with a reduced likelihood of reporting LLM output as useful (OR 0.98, CI 0.96-0.99). Nonusers compared to LLM users were less likely to report LLMs meeting unmet needs (45\%, 99/221 vs 65\%, 407/624; OR 0.48, CI 0.35-0.65), and males were more likely to report that LLMs do address unmet needs (OR 1.64, CI 1.18-2.28). Furthermore, nonusers compared to LLM users were less likely to agree that LLMs will improve functional roles (63\%, 140/221 vs 75\%, 469/624; OR 0.60, CI 0.43-0.85). Free-text opinions highlighted concerns regarding autonomy, outperformance, and reduced demand for care. Respondents also predicted changes to human interactions, including fewer but higher quality interactions and a change in consumer needs as LLMs become more common, which would require provider adaptation. Conclusions: Despite the reported benefits of LLMs, nonusers---primarily health care workers, older individuals, and females---appeared more hesitant to adopt these tools. These findings underscore the need for targeted education and support to address adoption barriers and ensure the successful integration of LLMs in health care. Anticipated role changes, evolving human interactions, and the risk of the digital divide further emphasize the need for careful implementation and ongoing evaluation of LLMs in health care to ensure equity and sustainability. ", doi="10.2196/67383", url="https://www.jmir.org/2025/1/e67383", url="http://www.ncbi.nlm.nih.gov/pubmed/40310666" } @Article{info:doi/10.2196/69428, author="Esmail, Shaniff and Concannon, Brendan", title="Immersive Virtual Reality and AI (Generative Pretrained Transformer) to Enhance Student Preparedness for Objective Structured Clinical Examinations: Mixed Methods Study", journal="JMIR Serious Games", year="2025", month="Apr", day="30", volume="13", pages="e69428", keywords="virtual reality", keywords="head-mounted display", keywords="immersive technology", keywords="artificial intelligence", keywords="generative pretrained transformer", keywords="occupational therapy", keywords="objective structured clinical examination", keywords="simulation", keywords="psychology", keywords="anxiety", abstract="Background: Immersive virtual reality (VR) and artificial intelligence have been used to determine whether a simulated clinical exam setting can reduce anxiety in first-year occupational therapy students preparing for objective structured clinical examinations (OSCEs). Test anxiety is common among postsecondary students, leading to negative outcomes such as increased dropout risk, lower grades, and limited employment opportunities. Students unfamiliar with specific testing environments are particularly prone to anxiety. VR simulations of OSCEs may allow students to become familiar with the exam setting and reduce anxiety. Objective: This study aimed to assess the efficacy of a VR simulation depicting clinical settings to reduce student anxiety about a clinical exam while gathering perspectives on their first-year coursework experiences to better understand their learning environment. Methods: An experimental, nonrandomized controlled trial compared state anxiety, trait test anxiety, and OSCE grades in 2 groups of first-year occupational therapy students analyzed using independent t tests (2-tailed). 
Group 1 (NoVR) was not exposed to the VR simulation and acted as a control group for group 2 (YesVR), who were exposed to the VR simulation. The VR used artificial intelligence in the form of a generative pretrained transformer to generate responses from virtual patients as students interacted with them in natural language. Self-reported psychometric scales measured anxiety levels 3 days before the OSCE. YesVR students completed perceived preparation surveys at 2 time points---3 weeks and 3 days before the OSCE---analyzed using dependent t tests. Semistructured interviews and focus groups were conducted within 1 week after the OSCE. Student perspectives on their classes and VR experiences were summarized using interpretative thematic analysis. Results: In total, 60 students---32 (53\%) in the NoVR group and 28 (47\%) in the YesVR group---participated in the study, and the YesVR group showed a significant reduction in state anxiety (t58=3.96; P<.001; Cohen d=1.02). The mean difference was 11.96 units (95\% CI 5.92-18.01). Trait test anxiety and OSCE scores remained static between groups. There was an increase in all perceived preparedness variables in the YesVR group. In total, 42\% (25/60) of the participants took part in interviews and focus groups, providing major themes regarding factors that affect OSCE performance, including student experience and background, feedback and support, fear of unknown, self-consciousness, and knowledge of the exam environment. Conclusions: Intolerance of uncertainty may lead students to interpret ambiguous exam situations as overly precarious. Findings suggest that VR simulation was associated with reduced state anxiety, although results from this small, nonrandomized sample should be interpreted cautiously. Qualitative data indicated that VR helped students gain familiarity with clinical exam settings, potentially decreasing uncertainty-based anxiety. Future research with larger or randomized samples is needed to confirm these findings and explore advanced VR tools offering feedback to enhance learning. ", doi="10.2196/69428", url="https://games.jmir.org/2025/1/e69428" } @Article{info:doi/10.2196/64486, author="Wang, Ling and Li, Jinglin and Zhuang, Boyang and Huang, Shasha and Fang, Meilin and Wang, Cunze and Li, Wen and Zhang, Mohan and Gong, Shurong", title="Accuracy of Large Language Models When Answering Clinical Research Questions: Systematic Review and Network Meta-Analysis", journal="J Med Internet Res", year="2025", month="Apr", day="30", volume="27", pages="e64486", keywords="large language models", keywords="LLM", keywords="clinical research questions", keywords="accuracy", keywords="network meta-analysis", keywords="PRISMA", abstract="Background: Large language models (LLMs) have flourished and gradually become an important research and application direction in the medical field. However, due to the high degree of specialization, complexity, and specificity of medicine, which results in extremely high accuracy requirements, controversy remains about whether LLMs can be used in the medical field. More studies have evaluated the performance of various types of LLMs in medicine, but the conclusions are inconsistent. Objective: This study uses a network meta-analysis (NMA) to assess the accuracy of LLMs when answering clinical research questions to provide high-level evidence-based evidence for its future development and application in the medical field. 
Methods: In this systematic review and NMA, we searched PubMed, Embase, Web of Science, and Scopus from inception until October 14, 2024. Studies on the accuracy of LLMs when answering clinical research questions were included and screened by reading published reports. The systematic review and NMA were conducted to compare the accuracy of different LLMs when answering clinical research questions, including objective questions, open-ended questions, top 1 diagnosis, top 3 diagnosis, top 5 diagnosis, and triage and classification. The NMA was performed using Bayesian frequency theory methods. Indirect intercomparisons between programs were performed using a grading scale. A larger surface under the cumulative ranking curve (SUCRA) value indicates a higher ranking of the corresponding LLM accuracy. Results: The systematic review and NMA examined 168 articles encompassing 35,896 questions and 3063 clinical cases. Of the 168 studies, 40 (23.8\%) were considered to have a low risk of bias, 128 (76.2\%) had a moderate risk, and none were rated as having a high risk. ChatGPT-4o (SUCRA=0.9207) demonstrated strong performance in terms of accuracy for objective questions, followed by Aeyeconsult (SUCRA=0.9187) and ChatGPT-4 (SUCRA=0.8087). ChatGPT-4 (SUCRA=0.8708) excelled at answering open-ended questions. In terms of accuracy for top 1 diagnosis and top 3 diagnosis of clinical cases, human experts (SUCRA=0.9001 and SUCRA=0.7126, respectively) ranked the highest, while Claude 3 Opus (SUCRA=0.9672) performed well at the top 5 diagnosis. Gemini (SUCRA=0.9649) had the highest rated SUCRA value for accuracy in the area of triage and classification. Conclusions: Our study indicates that ChatGPT-4o has an advantage when answering objective questions. For open-ended questions, ChatGPT-4 may be more credible. Humans are more accurate at the top 1 diagnosis and top 3 diagnosis. Claude 3 Opus performs better at the top 5 diagnosis, while for triage and classification, Gemini is more advantageous. This analysis offers valuable insights for clinicians and medical practitioners, empowering them to effectively leverage LLMs for improved decision-making in learning, diagnosis, and management of various clinical scenarios. Trial Registration: PROSPERO CRD42024558245; https://www.crd.york.ac.uk/PROSPERO/view/CRD42024558245 ", doi="10.2196/64486", url="https://www.jmir.org/2025/1/e64486", url="http://www.ncbi.nlm.nih.gov/pubmed/40305085" } @Article{info:doi/10.2196/68762, author="Yang, Ting-ting and Zheng, Hong-xia and Cao, Sha and Jing, Mei-ling and Hu, Ju and Zuo, Yan and Chen, Qing-yong and Zhang, Jian-jun", title="Harnessing an Artificial Intelligence--Based Large Language Model With Personal Health Record Capability for Personalized Information Support in Postsurgery Myocardial Infarction: Descriptive Qualitative Study", journal="J Med Internet Res", year="2025", month="Apr", day="30", volume="27", pages="e68762", keywords="myocardial infarction", keywords="post-surgery recovery", keywords="personalized health support", keywords="artificial intelligence", keywords="large language model", keywords="personal health record", keywords="digital health tools", keywords="health information accessibility", keywords="qualitative study", keywords="mobile phone", abstract="Background: Myocardial infarction (MI) remains a leading cause of morbidity and mortality worldwide. Although postsurgical cardiac interventions have improved survival rates, effective management during recovery remains challenging. 
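The SUCRA metric used in the network meta-analysis abstract above (Wang et al) can be computed directly from posterior rank probabilities: it is the mean of the cumulative ranking probabilities over the first K-1 ranks. The sketch below illustrates this with a placeholder probability matrix for 3 hypothetical models; it is not taken from the review.

# Illustrative SUCRA computation: rank_probs[i, j] is the probability that
# treatment i is ranked (j+1)-th best; SUCRA is the mean of the cumulative
# rank probabilities over the first K-1 ranks.
import numpy as np

def sucra(rank_probs: np.ndarray) -> np.ndarray:
    k = rank_probs.shape[1]
    cumulative = np.cumsum(rank_probs, axis=1)[:, : k - 1]
    return cumulative.mean(axis=1)

# Placeholder posterior rank probabilities for 3 hypothetical models
p = np.array([
    [0.70, 0.20, 0.10],
    [0.20, 0.50, 0.30],
    [0.10, 0.30, 0.60],
])
print(sucra(p))   # values closer to 1 indicate a higher (better) ranking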
Traditional informational support systems often provide generic guidance that does not account for individualized medical histories or psychosocial factors. Recently, artificial intelligence (AI)--based large language model (LLM) tools have emerged as promising interventions to deliver personalized health information to post-MI patients. Objective: We aim to explore the user experiences and perceptions of an AI-based LLM tool (iflyhealth) with integrated personal health record functionality in post-MI care, assess how patients and their family members engaged with the tool during recovery, identify the perceived benefits and challenges of using the technology, and understand the factors promoting or hindering continued use. Methods: A purposive sample of 20 participants (12 users and 8 nonusers) who underwent MI surgery within the previous 6 months was recruited between July and August 2024. Data were collected through semistructured, face-to-face interviews conducted in a private setting, using an interview guide to address participants' first impressions, usage patterns, and reasons for adoption or nonadoption of the iflyhealth app. The interviews were audio-recorded, transcribed verbatim, and analyzed using the Colaizzi method. Results: Four key themes were revealed: (1) participants' experiences varied based on digital literacy, prior exposure to health technologies, and individual recovery needs; (2) users appreciated the app's enhanced accessibility to professional health information, personalized advice tailored to their clinical conditions, and the tool's responsiveness to health status changes; (3) challenges such as difficulties with digital literacy, usability concerns, and data privacy issues were significant barriers; and (4) nonusers and those who discontinued use primarily cited complexity of the interface and perceived limited relevance of the advice as major deterrents. Conclusions: iflyhealth, an LLM AI app with a built-in personal health record functionality, shows significant potential in assisting post-MI patients. The main benefits reported by iflyhealth users include improved access to personalized health information and an enhanced ability to respond to changing health conditions. However, challenges such as digital literacy, usability, and privacy and security concerns persist. Overcoming the barriers may further enhance the use of the iflyhealth app, which can play an important role in patient-centered, personalized post-MI management. 
", doi="10.2196/68762", url="https://www.jmir.org/2025/1/e68762", url="http://www.ncbi.nlm.nih.gov/pubmed/40305084" } @Article{info:doi/10.2196/64963, author="Shan, Guxue and Chen, Xiaonan and Wang, Chen and Liu, Li and Gu, Yuanjing and Jiang, Huiping and Shi, Tingqi", title="Comparing Diagnostic Accuracy of Clinical Professionals and Large Language Models: Systematic Review and Meta-Analysis", journal="JMIR Med Inform", year="2025", month="Apr", day="25", volume="13", pages="e64963", keywords="machine learning", keywords="ML", keywords="artificial intelligence", keywords="AI", keywords="large language model", keywords="LLM", keywords="natural language processing", keywords="algorithm", keywords="model", keywords="analytics", keywords="NLP", keywords="deep learning", keywords="clinical diagnosis", keywords="diagnosis", keywords="diagnostic accuracy", keywords="accuracy", keywords="systematic review", abstract="Background: With the rapid development of artificial intelligence (AI) technology, especially generative AI, large language models (LLMs) have shown great potential in the medical field. Through massive medical data training, they can understand complex medical texts and can quickly analyze medical records and provide health counseling and diagnostic advice directly, especially in rare diseases. However, no study has yet compared and extensively discussed the diagnostic performance of LLMs with that of physicians. Objective: This study systematically reviewed the accuracy of LLMs in clinical diagnosis and provided a reference for further clinical application. Methods: We conducted searches in CNKI (China National Knowledge Infrastructure), VIP Database, SinoMed, PubMed, Web of Science, Embase, and CINAHL (Cumulative Index to Nursing and Allied Health Literature) from January 1, 2017, to the present. A total of 2 reviewers independently screened the literature and extracted relevant information. The risk of bias was assessed using the Prediction Model Risk of Bias Assessment Tool (PROBAST), which evaluates both the risk of bias and the applicability of included studies. Results: A total of 30 studies involving 19 LLMs and a total of 4762 cases were included. The quality assessment indicated a high risk of bias in the majority of studies, with the primary cause being the use of known case diagnoses. For the optimal model, the accuracy of the primary diagnosis ranged from 25\% to 97.8\%, while the triage accuracy ranged from 66.5\% to 98\%. Conclusions: LLMs have demonstrated considerable diagnostic capabilities and significant potential for application across various clinical cases. Although their accuracy still falls short of that of clinical professionals, if used cautiously, they have the potential to become one of the best intelligent assistants in the field of human health care. 
", doi="10.2196/64963", url="https://medinform.jmir.org/2025/1/e64963" } @Article{info:doi/10.2196/70566, author="AlFarabi Ali, Sarah and AlDehlawi, Hebah and Jazzar, Ahoud and Ashi, Heba and Esam Abuzinadah, Nihal and AlOtaibi, Mohammad and Algarni, Abdulrahman and Alqahtani, Hazzaa and Akeel, Sara and Almazrooa, Soulafa", title="The Diagnostic Performance of Large Language Models and Oral Medicine Consultants for Identifying Oral Lesions in Text-Based Clinical Scenarios: Prospective Comparative Study", journal="JMIR AI", year="2025", month="Apr", day="24", volume="4", pages="e70566", keywords="artificial intelligence", keywords="ChatGPT", keywords="Copilot", keywords="diagnosis", keywords="oral medicine", keywords="diagnostic performance", keywords="large language model", keywords="lesion", keywords="oral lesion", abstract="Background: The use of artificial intelligence (AI), especially large language models (LLMs), is increasing in health care, including in dentistry. There has yet to be an assessment of the diagnostic performance of LLMs in oral medicine. Objective: We aimed to compare the effectiveness of ChatGPT (OpenAI) and Microsoft Copilot (integrated within the Microsoft 365 suite) with oral medicine consultants in formulating accurate differential and final diagnoses for oral lesions from written clinical scenarios. Methods: Fifty comprehensive clinical case scenarios including patient age, presenting complaint, history of the presenting complaint, medical history, allergies, intra- and extraoral findings, lesion description, and any additional information including laboratory investigations and specific clinical features were given to three oral medicine consultants, who were asked to formulate a differential diagnosis and a final diagnosis. Specific prompts for the same 50 cases were designed and input into ChatGPT and Copilot to formulate both differential and final diagnoses. The diagnostic accuracy was compared between the LLMs and oral medicine consultants. Results: ChatGPT exhibited the highest accuracy, providing the correct differential diagnoses in 37 of 50 cases (74\%). There were no significant differences in the accuracy of providing the correct differential diagnoses between AI models and oral medicine consultants. ChatGPT was as accurate as consultants in making the final diagnoses, but Copilot was significantly less accurate than ChatGPT (P=.015) and one of the oral medicine consultants (P<.001) in providing the correct final diagnosis. Conclusions: ChatGPT and Copilot show promising performance for diagnosing oral medicine pathology in clinical case scenarios to assist dental practitioners. ChatGPT-4 and Copilot are still evolving, but even now, they might provide a significant advantage in the clinical setting as tools to help dental practitioners in their daily practice. 
", doi="10.2196/70566", url="https://ai.jmir.org/2025/1/e70566" } @Article{info:doi/10.2196/71521, author="Sakaguchi, Kota and Sakama, Reiko and Watari, Takashi", title="Evaluating ChatGPT in Qualitative Thematic Analysis With Human Researchers in the Japanese Clinical Context and Its Cultural Interpretation Challenges: Comparative Qualitative Study", journal="J Med Internet Res", year="2025", month="Apr", day="24", volume="27", pages="e71521", keywords="ChatGPT", keywords="large language models", keywords="qualitative research", keywords="sacred moment(s)", keywords="thematic analysis", abstract="Background: Qualitative research is crucial for understanding the values and beliefs underlying individual experiences, emotions, and behaviors, particularly in social sciences and health care. Traditionally reliant on manual analysis by experienced researchers, this methodology requires significant time and effort. The advent of artificial intelligence (AI) technology, especially large language models such as ChatGPT (OpenAI), holds promise for enhancing qualitative data analysis. However, existing studies have predominantly focused on AI's application to English-language datasets, leaving its applicability to non-English languages, particularly structurally and contextually complex languages such as Japanese, insufficiently explored. Objective: This study aims to evaluate the feasibility, strengths, and limitations of ChatGPT-4 in analyzing qualitative Japanese interview data by directly comparing its performance with that of experienced human researchers. Methods: A comparative qualitative study was conducted to assess the performance of ChatGPT-4 and human researchers in analyzing transcribed Japanese semistructured interviews. The analysis focused on thematic agreement rates, interpretative depth, and ChatGPT's ability to process culturally nuanced concepts, particularly for descriptive and socio-culturally embedded themes. This study analyzed transcripts from 30 semistructured interviews conducted between February and March 2024 in an urban community hospital (Hospital A) and a rural university hospital (Hospital B) in Japan. Interviews centered on the theme of ``sacred moments'' and involved health care providers and patients. Transcripts were digitized using NVivo (version 14; Lumivero) and analyzed using ChatGPT-4 with iterative prompts for thematic analysis. The results were compared with a reflexive thematic analysis performed by human researchers. Furthermore, to assess the adaptability and consistency of ChatGPT in qualitative analysis, Charmaz's grounded theory and Pope's five-step framework approach were applied. Results: ChatGPT-4 demonstrated high thematic agreement rates (>80\%) with human researchers for descriptive themes such as ``personal experience of a sacred moment'' and ``building relationships.'' However, its performance declined for themes requiring deeper cultural and emotional interpretation, such as ``difficult to answer, no experience of sacred moments'' and ``fate.'' For these themes, agreement rates were approximately 30\%, revealing significant limitations in ChatGPT's ability to process context-dependent linguistic structures and implicit emotional expressions in Japanese. Conclusions: ChatGPT-4 demonstrates potential as an auxiliary tool in qualitative research, particularly for efficiently identifying descriptive themes within Japanese-language datasets. 
However, its limited capacity to interpret cultural and emotional nuances highlights the continued necessity of human expertise in qualitative analysis. These findings emphasize the complementary role of AI-assisted qualitative research and underscore the importance of further advancements in AI models tailored to non-English linguistic and cultural contexts. Future research should explore strategies to enhance AI's interpretability, expand multilingual training datasets, and assess the applicability of emerging AI models in diverse cultural settings. In addition, ethical and legal considerations in AI-driven qualitative analysis require continued scrutiny. ", doi="10.2196/71521", url="https://www.jmir.org/2025/1/e71521" } @Article{info:doi/10.2196/65670, author="Weisman, Dan and Sugarman, Alanna and Huang, Ming Yue and Gelberg, Lillian and Ganz, A. Patricia and Comulada, Scott Warren", title="Development of a GPT-4--Powered Virtual Simulated Patient and Communication Training Platform for Medical Students to Practice Discussing Abnormal Mammogram Results With Patients: Multiphase Study", journal="JMIR Form Res", year="2025", month="Apr", day="17", volume="9", pages="e65670", keywords="standardized patient", keywords="virtual simulated patient", keywords="artificial intelligence", keywords="AI", keywords="large language model", keywords="LLM", keywords="GPT-4", keywords="agent", keywords="communication skills training", keywords="abnormal mammography results", keywords="biopsy", abstract="Background: Standardized patients (SPs) prepare medical students for difficult conversations with patients. Despite their value, SP-based simulation training is constrained by available resources and competing clinical demands. Researchers are turning to artificial intelligence and large language models, such as generative pretrained transformers, to create communication training that incorporates virtual simulated patients (VSPs). GPT-4 is a large language model advance allowing developers to design virtual simulation scenarios using text-based prompts instead of relying on branching path simulations with prescripted dialogue. These nascent developmental practices have not taken root in the literature to guide other researchers in developing their own simulations. Objective: This study aims to describe our developmental process and lessons learned for creating a GPT-4--driven VSP. We designed the VSP to help medical student learners rehearse discussing abnormal mammography results with a patient as a primary care physician (PCP). We aimed to assess GPT-4's ability to generate appropriate VSP responses to learners during spoken conversations and provide appropriate feedback on learner performance. Methods: A research team comprised of physicians, a medical student, an educator, an SP program director, a learning experience designer, and a health care researcher conducted the study. A formative phase with in-depth knowledge user interviews informed development, followed by a development phase to create the virtual training module. The team conducted interviews with 5 medical students, 5 PCPs, and 5 breast cancer survivors. They then developed a VSP using simulation authoring software and provided the GPT-4--enabled VSP with an initial prompt consisting of a scenario description, emotional state, and expectations for learner dialogue. It was iteratively refined through an agile design process involving repeated cycles of testing, documenting issues, and revising the prompt. 
As an exploratory feature, the simulation used GPT-4 to provide written feedback to learners about their performance communicating with the VSP and their adherence to guidelines for difficult conversations. Results: In-depth interviews helped establish the appropriate timing, mode of communication, and protocol for conversations between PCPs and patients during the breast cancer screening process. The scenario simulated a telephone call between a physician and patient to discuss the abnormal results of a diagnostic mammogram that indicated a need for a biopsy. Preliminary testing was promising. The VSP asked sensible questions about their mammography results and responded to learner inquiries using a voice replete with appropriate emotional inflections. GPT-4 generated performance feedback that successfully identified strengths and areas for improvement using relevant quotes from the learner-VSP conversation, but it occasionally misidentified learner adherence to communication protocols. Conclusions: GPT-4 streamlined development and facilitated more dynamic, humanlike interactions between learners and the VSP compared to branching path simulations. For the next steps, we will pilot-test the VSP with medical students to evaluate its feasibility and acceptability. ", doi="10.2196/65670", url="https://formative.jmir.org/2025/1/e65670" } @Article{info:doi/10.2196/63786, author="Yang, Xiongwen and Xiao, Yi and Liu, Di and Shi, Huiyou and Deng, Huiyin and Huang, Jian and Zhang, Yun and Liu, Dan and Liang, Maoli and Jin, Xing and Sun, Yongpan and Yao, Jing and Zhou, XiaoJiang and Guo, Wankai and He, Yang and Tang, Weijuan and Xu, Chuan", title="Enhancing Physician-Patient Communication in Oncology Using GPT-4 Through Simplified Radiology Reports: Multicenter Quantitative Study", journal="J Med Internet Res", year="2025", month="Apr", day="17", volume="27", pages="e63786", keywords="radiology reports", keywords="doctor-patient communication", keywords="large language models", keywords="oncology", keywords="GPT-4", abstract="Background: Effective physician-patient communication is essential in clinical practice, especially in oncology, where radiology reports play a crucial role. These reports are often filled with technical jargon, making them challenging for patients to understand and affecting their engagement and decision-making. Large language models, such as GPT-4, offer a novel approach to simplifying these reports and potentially enhancing communication and patient outcomes. Objective: We aimed to assess the feasibility and effectiveness of using GPT-4 to simplify oncological radiology reports to improve physician-patient communication. Methods: In a retrospective study approved by the ethics review committees of multiple hospitals, 698 radiology reports for malignant tumors produced between October 2023 and December 2023 were analyzed. In total, 70 (10\%) reports were selected to develop templates and scoring scales for GPT-4 to create simplified interpretative radiology reports (IRRs). Radiologists checked the consistency between the original radiology reports and the IRRs, while volunteer family members of patients, all of whom had at least a junior high school education and no medical background, assessed readability. Doctors evaluated communication efficiency through simulated consultations. 
Results: Transforming original radiology reports into IRRs resulted in clearer reports, with word count increasing from 818.74 to 1025.82 (P<.001), volunteers' reading time decreasing from 674.86 seconds to 589.92 seconds (P<.001), and reading rate increasing from 72.15 words per minute to 104.70 words per minute (P<.001). Physician-patient communication time significantly decreased, from 1116.11 seconds to 745.30 seconds (P<.001), and patient comprehension scores improved from 5.51 to 7.83 (P<.001). Conclusions: This study demonstrates the significant potential of large language models, specifically GPT-4, to facilitate medical communication by simplifying oncological radiology reports. Simplified reports enhance patient understanding and the efficiency of doctor-patient interactions, suggesting a valuable application of artificial intelligence in clinical practice to improve patient outcomes and health care communication. ", doi="10.2196/63786", url="https://www.jmir.org/2025/1/e63786" } @Article{info:doi/10.2196/70535, author="Li, Caixia and Zhao, Yina and Bai, Yang and Zhao, Baoquan and Tola, Oluwafunmilayo Yetunde and Chan, WH Carmen and Zhang, Meifen and Fu, Xia", title="Unveiling the Potential of Large Language Models in Transforming Chronic Disease Management: Mixed Methods Systematic Review", journal="J Med Internet Res", year="2025", month="Apr", day="16", volume="27", pages="e70535", keywords="artificial intelligence", keywords="chronic disease", keywords="health management", keywords="large language model", keywords="systematic review", abstract="Background: Chronic diseases are a major global health burden, accounting for nearly three-quarters of the deaths worldwide. Large language models (LLMs) are advanced artificial intelligence systems with transformative potential to optimize chronic disease management; however, robust evidence is lacking. Objective: This review aims to synthesize evidence on the feasibility, opportunities, and challenges of LLMs across the disease management spectrum, from prevention to screening, diagnosis, treatment, and long-term care. Methods: Following the PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analysis) guidelines, 11 databases (Cochrane Central Register of Controlled Trials, CINAHL, Embase, IEEE Xplore, MEDLINE via Ovid, ProQuest Health \& Medicine Collection, ScienceDirect, Scopus, Web of Science Core Collection, China National Knowledge Internet, and SinoMed) were searched on April 17, 2024. Intervention and simulation studies that examined LLMs in the management of chronic diseases were included. The methodological quality of the included studies was evaluated using a rating rubric designed for simulation-based research and the risk of bias in nonrandomized studies of interventions tool for quasi-experimental studies. Narrative analysis with descriptive figures was used to synthesize the study findings. Random-effects meta-analyses were conducted to assess the pooled effect estimates of the feasibility of LLMs in chronic disease management. Results: A total of 20 studies examined general-purpose (n=17) and retrieval-augmented generation-enhanced LLMs (n=3) for the management of chronic diseases, including cancer, cardiovascular diseases, and metabolic disorders. 
LLMs demonstrated feasibility across the chronic disease management spectrum by generating relevant, comprehensible, and accurate health recommendations (pooled accuracy rate 71\%, 95\% CI 0.59-0.83; $I^2$=88.32\%), with retrieval-augmented generation-enhanced LLMs having higher accuracy rates compared to general-purpose LLMs (odds ratio 2.89, 95\% CI 1.83-4.58; $I^2$=54.45\%). LLMs facilitated equitable information access; increased patient awareness regarding ailments, preventive measures, and treatment options; and promoted self-management behaviors in lifestyle modification and symptom coping. Additionally, LLMs facilitated compassionate emotional support, social connections, and access to health care resources to improve the health outcomes of chronic diseases. However, LLMs face challenges in addressing privacy, language, and cultural issues; undertaking advanced tasks, including diagnosis, medication, and comorbidity management; and generating personalized regimens with real-time adjustments and multiple modalities. Conclusions: LLMs have demonstrated the potential to transform chronic disease management at the individual, social, and health care levels; however, their direct application in clinical settings is still in its infancy. A multifaceted approach that incorporates robust data security, domain-specific model fine-tuning, multimodal data integration, and wearables is crucial for the evolution of LLMs into invaluable adjuncts for health care professionals to transform chronic disease management. Trial Registration: PROSPERO CRD42024545412; https://www.crd.york.ac.uk/PROSPERO/view/CRD42024545412 ", doi="10.2196/70535", url="https://www.jmir.org/2025/1/e70535" } @Article{info:doi/10.2196/62909, author="Nair, Subjagouri Rakhi Asokkumar and Hartung, Matthias and Heinisch, Philipp and Jaskolski, Janik and Starke-Kn{\"a}usel, Cornelius and Ver{\'i}ssimo, Susana and Schmidt, Maria David and Cimiano, Philipp", title="Summarizing Online Patient Conversations Using Generative Language Models: Experimental and Comparative Study", journal="JMIR Med Inform", year="2025", month="Apr", day="14", volume="13", pages="e62909", keywords="patient experience", keywords="online communities", keywords="summarizing", keywords="large language models", abstract="Background: Social media is acknowledged by regulatory bodies (eg, the Food and Drug Administration) as an important source of patient experience data to learn about patients' unmet needs, priorities, and preferences. However, current methods rely either on manual analysis, which does not scale, or on automatic processing, which yields mainly quantitative insights. Methods that can automatically summarize texts and yield qualitative insights at scale are missing. Objective: The objective of this study was to evaluate to what extent state-of-the-art large language models can appropriately summarize posts shared by patients in web-based forums and health communities. Specifically, the goal was to compare the performance of different language models and prompting strategies on the task of summarizing documents reflecting the experiences of individual patients. Methods: In our experimental and comparative study, we applied 3 different language models (Flan-T5 and the Generative Pretrained Transformer [GPT] models GPT-3 and GPT-3.5) in combination with various prompting strategies to the task of summarizing posts from patients in online communities. The generated summaries were evaluated with respect to 124 manually created summaries as a ground-truth reference. 
For evaluation, we used 2 standard metrics from the field of text generation, namely, Recall-Oriented Understudy for Gisting Evaluation (ROUGE) and BERTScore, to compare the automatically generated summaries to the manually created reference summaries. Results: Among the zero-shot prompting--based large language models investigated, GPT-3.5 performed better than the other models with respect to both the ROUGE metrics and BERTScore. While zero-shot prompting alone performed well, GPT-3.5 combined with directional stimulus prompting in a 3-shot setting achieved the best results overall on these metrics. A manual review of the summaries produced by the best-performing method showed that they were accurate and plausible compared with the manual summaries. Conclusions: Taken together, our results suggest that state-of-the-art pretrained language models are a valuable tool for deriving qualitative insights into the patient experience, including unmet needs, patient priorities, and how a disease impacts daily functioning and quality of life, which can inform efforts to improve health care delivery and to focus drug development on patients' actual priorities. The key limitations of our work are the small data sample and the fact that the manual summaries were created by only 1 annotator. Furthermore, the results hold only for the examined models and prompting strategies and may not generalize to other models and strategies. ", doi="10.2196/62909", url="https://medinform.jmir.org/2025/1/e62909" } @Article{info:doi/10.2196/62857, author="Yan, Zelin and Liu, Jingwen and Fan, Yihong and Lu, Shiyuan and Xu, Dingting and Yang, Yun and Wang, Honggang and Mao, Jie and Tseng, Hou-Chiang and Chang, Tao-Hsing and Chen, Yan", title="Ability of ChatGPT to Replace Doctors in Patient Education: Cross-Sectional Comparative Analysis of Inflammatory Bowel Disease", journal="J Med Internet Res", year="2025", month="Mar", day="31", volume="27", pages="e62857", keywords="AI-assisted", keywords="patient education", keywords="inflammatory bowel disease", keywords="artificial intelligence", keywords="ChatGPT", keywords="patient communities", keywords="social media", keywords="disease management", keywords="readability", keywords="online health information", keywords="conversational agents", abstract="Background: Although large language models (LLMs) such as ChatGPT show promise for providing specialized information, their quality requires further evaluation. This is especially true considering that these models are trained on internet text and the quality of health-related information available online varies widely. Objective: The aim of this study was to evaluate the performance of ChatGPT in the context of patient education for individuals with chronic diseases, comparing it with that of industry experts to elucidate its strengths and limitations. Methods: This evaluation was conducted in September 2023 by analyzing the responses of ChatGPT and specialist doctors to questions posed by patients with inflammatory bowel disease (IBD). We compared their performance in terms of subjective accuracy, empathy, completeness, and overall quality, as well as readability to support objective analysis. Results: In a series of 1578 binary choice assessments, ChatGPT was preferred in 48.4\% (95\% CI 45.9\%-50.9\%) of instances. 
There were 12 instances where ChatGPT's responses were unanimously preferred by all evaluators, compared with 17 instances for specialist doctors. In terms of overall quality, there was no significant difference between the responses of ChatGPT (3.98, 95\% CI 3.93-4.02) and those of specialist doctors (3.95, 95\% CI 3.90-4.00; $t_{524}$=0.95, P=.34), both being considered ``good.'' Although differences in accuracy ($t_{521}$=0.48, P=.63) and empathy ($t_{511}$=2.19, P=.03) lacked statistical significance, the completeness of textual output ($t_{509}$=9.27, P<.001) was a distinct advantage of the LLM (ChatGPT). In the sections of the questionnaire where patients and doctors responded together (Q223-Q242), ChatGPT demonstrated inferior performance ($t_{36}$=2.91, P=.006). Regarding readability, no statistical difference was found between the responses of specialist doctors (median: 7th grade; Q1: 4th grade; Q3: 8th grade) and those of ChatGPT (median: 7th grade; Q1: 7th grade; Q3: 8th grade) according to the Mann-Whitney U test (P=.09). The overall quality of ChatGPT's output exhibited strong correlations with other subdimensions (with empathy: r=0.842; with accuracy: r=0.839; with completeness: r=0.795), and there was also a high correlation between the subdimensions of accuracy and completeness (r=0.762). Conclusions: ChatGPT demonstrated more stable performance across various dimensions. Its output of health information content is more structurally sound, addressing the issue of variability in the information from individual specialist doctors. ChatGPT's performance highlights its potential as an auxiliary tool for health information, despite limitations such as artificial intelligence hallucinations. It is recommended that patients be involved in the creation and evaluation of health information to enhance the quality and relevance of the information. ", doi="10.2196/62857", url="https://www.jmir.org/2025/1/e62857" } @Article{info:doi/10.2196/65178, author="West, Matthew and Cheng, You and He, Yingnan and Leng, Yu and Magdamo, Colin and Hyman, T. Bradley and Dickson, R. John and Serrano-Pozo, Alberto and Blacker, Deborah and Das, Sudeshna", title="Unsupervised Deep Learning of Electronic Health Records to Characterize Heterogeneity Across Alzheimer Disease and Related Dementias: Cross-Sectional Study", journal="JMIR Aging", year="2025", month="Mar", day="31", volume="8", pages="e65178", keywords="Alzheimer disease and related dementias", keywords="electronic health records", keywords="large language models", keywords="clustering", keywords="unsupervised learning", abstract="Background: Alzheimer disease and related dementias (ADRD) exhibit prominent heterogeneity. Identifying clinically meaningful ADRD subtypes is essential for tailoring treatments to specific patient phenotypes. Objective: We aimed to use unsupervised learning techniques on electronic health records (EHRs) from memory clinic patients to identify ADRD subtypes. Methods: We used pretrained embeddings of non-ADRD diagnosis codes (International Classification of Diseases, Ninth Revision) and large language model (LLM)--derived embeddings of clinical notes from patient EHRs. Hierarchical clustering of these embeddings was used to identify ADRD subtypes. Clusters were characterized regarding their demographic and clinical features. Results: We analyzed a cohort of 3454 patients with ADRD from a memory clinic at Massachusetts General Hospital, each with a specialist diagnosis. 
Clustering pretrained embeddings of the non-ADRD diagnosis codes in patient EHRs revealed the following 3 patient subtypes: one with skin conditions, another with psychiatric disorders and an earlier age of onset, and a third with diabetes complications. Similarly, using LLM-derived embeddings of clinical notes, we identified 3 subtypes of patients as follows: one with psychiatric manifestations and higher prevalence of female participants (prevalence ratio: 1.59), another with cardiovascular and motor problems and higher prevalence of male participants (prevalence ratio: 1.75), and a third one with geriatric health disorders. Notably, we observed significant overlap between clusters from both data modalities ($\chi^2_4$=89.4; P<.001). Conclusions: By integrating International Classification of Diseases, Ninth Revision codes and LLM-derived embeddings, our analysis delineated 2 distinct ADRD subtypes with sex-specific comorbid and clinical presentations, offering insights for potential precision medicine approaches. ", doi="10.2196/65178", url="https://aging.jmir.org/2025/1/e65178" } @Article{info:doi/10.2196/65729, author="Miletic, Marko and Sariyar, Murat", title="Utility-based Analysis of Statistical Approaches and Deep Learning Models for Synthetic Data Generation With Focus on Correlation Structures: Algorithm Development and Validation", journal="JMIR AI", year="2025", month="Mar", day="20", volume="4", pages="e65729", keywords="synthetic data generation", keywords="medical data synthesis", keywords="random forests", keywords="simulation study", keywords="deep learning", keywords="propensity score mean-squared error", abstract="Background: Recent advancements in Generative Adversarial Networks and large language models (LLMs) have significantly advanced the synthesis and augmentation of medical data. These and other deep learning--based methods offer promising potential for generating high-quality, realistic datasets crucial for improving machine learning applications in health care, particularly in contexts where data privacy and availability are limiting factors. However, challenges remain in accurately capturing the complex associations inherent in medical datasets. Objective: This study evaluates the effectiveness of various Synthetic Data Generation (SDG) methods in replicating the correlation structures inherent in real medical datasets. In addition, it examines their performance in downstream tasks using Random Forests (RFs) as the benchmark model. To provide a comprehensive analysis, alternative models such as eXtreme Gradient Boosting and Gated Additive Tree Ensembles are also considered. We compare the following SDG approaches: Synthetic Populations in R (synthpop), copula, copulagan, Conditional Tabular Generative Adversarial Network (ctgan), tabular variational autoencoder (tvae), and tabula for LLMs. Methods: We evaluated synthetic data generation methods using both real-world and simulated datasets. Simulated data consist of 10 Gaussian variables and one binary target variable with varying correlation structures, generated via Cholesky decomposition. Real-world datasets include the body performance dataset with 13,393 samples for fitness classification, the Wisconsin Breast Cancer dataset with 569 samples for tumor diagnosis, and the diabetes dataset with 768 samples for diabetes prediction. 
Data quality is evaluated by comparing correlation matrices, the propensity score mean-squared error (pMSE) for general utility, and F1-scores for downstream tasks as a specific utility metric, using training on synthetic data and testing on real data. Results: Our simulation study, supplemented with real-world data analyses, shows that the statistical methods copula and synthpop consistently outperform deep learning approaches across various sample sizes and correlation complexities, with synthpop being the most effective. Deep learning methods, including LLMs, show mixed performance, particularly with smaller datasets or limited training epochs. LLMs often struggle to replicate numerical dependencies effectively. In contrast, methods like tvae with 10,000 epochs perform comparably well. On the body performance dataset, copulagan achieves the best performance in terms of pMSE. The results also highlight that model utility depends more on the relative correlations between features and the target variable than on the absolute magnitude of correlation matrix differences. Conclusions: Statistical methods, particularly synthpop, demonstrate superior robustness and utility preservation for synthetic tabular data compared with deep learning approaches. Copula methods show potential but face limitations with integer variables. Deep learning methods underperform in this context. Overall, these findings underscore the dominance of statistical methods for synthetic data generation for tabular data, while highlighting the niche potential of deep learning approaches for highly complex datasets, provided adequate resources and tuning. ", doi="10.2196/65729", url="https://ai.jmir.org/2025/1/e65729" } @Article{info:doi/10.2196/70481, author="Nazar, Wojciech and Nazar, Grzegorz and Kami{\'n}ska, Aleksandra and Danilowicz-Szymanowicz, Ludmila", title="How to Design, Create, and Evaluate an Instruction-Tuning Dataset for Large Language Model Training in Health Care: Tutorial From a Clinical Perspective", journal="J Med Internet Res", year="2025", month="Mar", day="18", volume="27", pages="e70481", keywords="generative artificial intelligence", keywords="large language models", keywords="instruction-tuning datasets", keywords="tutorials", keywords="evaluation framework", keywords="health care", doi="10.2196/70481", url="https://www.jmir.org/2025/1/e70481" }
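The propensity score mean-squared error (pMSE) reported as the general utility metric in the synthetic data study above (doi 10.2196/65729) can be illustrated with a minimal Python sketch; this is not code from any of the cited works, and the toy Gaussian data and scikit-learn logistic propensity model are assumptions made purely for illustration:

# Illustrative pMSE sketch (assumed setup, not from the cited study):
# a propensity model tries to distinguish real from synthetic rows; the closer
# its predicted probabilities stay to the synthetic fraction c, the higher the utility.
import numpy as np
from sklearn.linear_model import LogisticRegression

def pmse(real: np.ndarray, synthetic: np.ndarray) -> float:
    X = np.vstack([real, synthetic])
    y = np.concatenate([np.zeros(len(real)), np.ones(len(synthetic))])
    c = len(synthetic) / len(X)  # expected score if real and synthetic are indistinguishable
    p = LogisticRegression(max_iter=1000).fit(X, y).predict_proba(X)[:, 1]
    return float(np.mean((p - c) ** 2))  # 0 means the model cannot tell the datasets apart

# Toy example with hypothetical Gaussian data
rng = np.random.default_rng(0)
real = rng.normal(size=(500, 10))
synthetic = rng.normal(loc=0.1, size=(500, 10))
print(round(pmse(real, synthetic), 4))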