<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.0 20120330//EN" "http://jats.nlm.nih.gov/publishing/1.0/JATS-journalpublishing1.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" article-type="research-article" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">IR</journal-id>
<journal-title-group>
<journal-title>Information Research</journal-title>
</journal-title-group>
<issn pub-type="epub">1368-1613</issn>
<publisher>
<publisher-name>University of Bor&#x00E5;s</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">ir30iConf47530</article-id>
<article-id pub-id-type="doi">10.47989/ir30iConf47530</article-id>
<article-categories>
<subj-group xml:lang="en">
<subject>Research article</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Improving scholarship accessibility with reinforcement learning</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author"><name><surname>Wang</surname><given-names>Haining</given-names></name>
<xref ref-type="aff" rid="aff0001"/></contrib>
<contrib contrib-type="author"><name><surname>Clark</surname><given-names>Jason</given-names></name>
<xref ref-type="aff" rid="aff0002"/></contrib>
<contrib contrib-type="author"><name><surname>McKelvey</surname><given-names>Hannah</given-names></name>
<xref ref-type="aff" rid="aff0003"/></contrib>
<contrib contrib-type="author"><name><surname>Sterman</surname><given-names>Leila</given-names></name>
<xref ref-type="aff" rid="aff0004"/></contrib>
<contrib contrib-type="author"><name><surname>Gao</surname><given-names>Zheng</given-names></name>
<xref ref-type="aff" rid="aff0005"/></contrib>
<contrib contrib-type="author"><name><surname>Tian</surname><given-names>Zuoyu</given-names></name>
<xref ref-type="aff" rid="aff0006"/></contrib>
<contrib contrib-type="author"><name><surname>Liu</surname><given-names>Xiaozhong</given-names></name>
<xref ref-type="aff" rid="aff0007"/></contrib>
<aff id="aff0001"><bold>Haining Wang</bold> is a doctoral candidate in Information Science at Indiana University Bloomington. His research focuses on natural language processing with applications in the humanities, social sciences, and biomedical sciences. He can be reached at <email xlink:href="hw56@iu.edu">hw56@iu.edu</email>.</aff>
<aff id="aff0002"><bold>Jason A. Clark</bold> is a Professor and Head of Research Optimisation, Analytics, and Data Services (ROADS) at Montana State University Library. His research interests include machine learning, digital libraries, and the intersection of artificial intelligence with user experience, including algorithmic literacy and machine learning patterns. He can be contacted at <email xlink:href="jaclark@montana.edu">jaclark@montana.edu</email>.</aff>
<aff id="aff0003"><bold>Hannah McKelvey</bold> is an Associate Professor at Montana State University Library. Her research interests are focused on practice-based librarianship, focusing on electronic resource management, user discovery behaviour, and collection assessment through usage analytics. She can be contacted at <email xlink:href="hannah.mckelvey@montana.edu">hannah.mckelvey@montana.edu</email>.</aff>
<aff id="aff0004"><bold>Leila Sterman</bold> is an Associate Professor at Montana State University Library. Her research interests focus on scholarly communication, including open access publishing, copyright, and institutional repositories. She can be contacted at <email xlink:href="leila.sterman@montana.edu">leila.sterman@montana.edu</email>.</aff>
<aff id="aff0005"><bold>Zheng Gao</bold> is currently a senior algorithm engineer at Ant Group. His research interests encompass machine learning, deep learning, and large language models, including LLM post-training and evaluation. He can be contacted at <email xlink:href="gao27@alumni.iu.edu">gao27@alumni.iu.edu</email>.</aff>
<aff id="aff0006"><bold>Zuoyu Tian</bold> is an Assistant Professor at Macalester College. His research interests lie in computational linguistics and natural language processing, with a particular focus on applying computationally intensive methods to study language variation and change. He can be contacted at <email xlink:href="ztian@macalester.edu">ztian@macalester.edu</email>.</aff>
<aff id="aff0007"><bold>Xiaozhong Liu</bold> is an Associate Professor in Computer Science and Data Science at Worcester Polytechnic Institute. His research interests include natural language processing, text/graph mining, information retrieval/recommendation, metadata, and computational social science. He can be contacted at <email xlink:href="xliu14@wpi.edu">xliu14@wpi.edu</email>.</aff>
</contrib-group>
<pub-date pub-type="epub"><day>06</day><month>05</month><year>2025</year></pub-date>
<pub-date pub-type="collection"><year>2025</year></pub-date>
<volume>30</volume>
<issue>i</issue>
<fpage>203</fpage>
<lpage>218</lpage>
<permissions>
<copyright-year>2025</copyright-year>
<copyright-holder>&#x00A9; 2025 The Author(s).</copyright-holder>
<license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by-nc/4.0/">
<license-p>This is an Open Access article distributed under the terms of the Creative Commons Attribution-NonCommercial 4.0 International License (<ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by-nc/4.0/">http://creativecommons.org/licenses/by-nc/4.0/</ext-link>), permitting all non-commercial use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
</license>
</permissions>
<abstract xml:lang="en">
<title>Abstract</title>
<p><bold>Introduction.</bold> A vast amount of scholarly work is published daily, yet much of it remains inaccessible to the general public due to dense jargon and complex language. We introduce a reinforcement learning approach that fine-tunes a language model to rewrite scholarly abstracts into more comprehensible versions.</p>
<p><bold>Method.</bold> Our approach utilises a carefully balanced combination of word- and sentence-level accessibility rewards to guide the language model in substituting technical terms with more accessible alternatives, a task which models supervised fine-tuned or guided by conventional readability measures struggle to accomplish.</p>
<p><bold>Analysis.</bold> We evaluate our model&#x2019;s performance through readability metrics, factual accuracy assessments and language quality measurements, comparing results against supervised fine-tuning baselines.</p>
<p><bold>Results.</bold> Our best model adjusts the readability level of scholarly abstracts by approximately six US grade levels&#x2014;in other words, from a postgraduate to a high school level. This translates to roughly a 90% relative improvement over the supervised fine-tuning baseline, while maintaining factual accuracy and high-quality language.</p>
<p><bold>Conclusion.</bold> We envision our work as a step toward bridging the gap between scholarly research and the general public, particularly younger readers, and those without a college degree.</p>
</abstract>
</article-meta>
</front>
<body>
<sec id="sec1">
<title>Introduction</title>
<sec id="sec1_1">
<title>Accessible language in science communication</title>
<p>At first glance, the daily publication of tens of thousands of scientific papers&#x2014;many freely accessible through open science and open access initiatives&#x2014;suggests few barriers to knowledge dissemination. However, two key facts challenge this perception and show that significant barriers remain. A recent survey by the US Department of Education found that more than half of US adults aged 16 to 74 (54%, or 130 million people) read at or below a sixth-grade level (<xref rid="R20" ref-type="bibr">Rothwell, 2020</xref>). Meanwhile, an analysis of the readability of biomedical research abstracts published from 1881 to 2015 found that scientific writing has become increasingly difficult to read over time (<xref rid="R16" ref-type="bibr">Plav&#x00E9;n-Sigray et al., 2017</xref>). Even when intended to be accessible, scientific abstracts typically require a postgraduate level of reading comprehension due to jargon use and sentence structure (<xref rid="R24" ref-type="bibr">Wang and Clark, 2024</xref>). This discrepancy leaves a significant portion of the population&#x2014;including young readers and adults without advanced degrees&#x2014;unable to fully engage with scientific works, even if these are made freely available online. The <italic>&#x2018;infodemic&#x2019;</italic> surrounding COVID-19 highlighted this issue: the urgent need for understandable information about the virus clashed with the complex presentation of scientific findings&#x2014;leading many to turn to more digestible but less reliable narratives on social media (<xref rid="R25" ref-type="bibr">Wang et al., 2019</xref>; <xref rid="R7" ref-type="bibr">Islam et al., 2020</xref>; <xref rid="R2" ref-type="bibr">Calleja et al., 2021</xref>).</p>
<p>While the legal and medical fields have long been encouraged to use accessible language as a clear conduit for public engagement (<xref rid="R12" ref-type="bibr">Mazur, 2000</xref>; <xref rid="R15" ref-type="bibr">Petelin, 2010</xref>), momentum for the adoption of accessible language within scientific communities has been building, roughly since the start of the open science movement (<xref rid="R21" ref-type="bibr">Schriver, 2017</xref>). For instance, the National Institutes of Health (NIH) advocates for &#x2018;clear and simple&#x2019; principles when communicating with audiences with limited health literacy, and the <italic>Proceedings of the National Academy of Sciences of the United States of America</italic> (<italic>PNAS</italic>) requires authors to submit a significance statement accessible to non-experts (<xref rid="R1" ref-type="bibr">Berenbaum, 2021</xref>; <xref rid="R17" ref-type="bibr">Pool et al., 2021</xref>). However, there are inherent conflicts between the specialised nature of communication among disciplinary peer scholars and the public-oriented dissemination of scientific findings. Even assuming that communicating scholarly works in plain language is possible, it will inevitably increase the communication cost among domain experts and create confusion at the more advanced levels, compared to the use of jargon and technical terms. In an era of increasingly specialized scientific research, this conundrum is not easily addressed by scientists or disseminators. Hence, given the current landscape, the widespread adoption of accessible language in scholarly works is unlikely in the near future.</p>
<p>In response, we propose addressing the need for communicating scientific findings to a broader audience by <italic>rewriting scholarly abstracts with simpler words and grammar using language models.</italic> Since readability is key to comprehending scholarship (<xref rid="R4" ref-type="bibr">Flesch, 1946</xref>; <xref rid="R3" ref-type="bibr">DuBay, 2004</xref>; <xref rid="R9" ref-type="bibr">Kerwer et al., 2021</xref>), we envision the resulting accessible narratives as paving the way for the &#x2018;last mile&#x2019; of science, broadening access to scientific understanding and engagement, especially for younger readers and those without a university degree.</p>
</sec>
<sec id="sec1_2">
<title>Challenges to effective simplification</title>
<p>Fine-tuning a language model using pairs of abstracts and their accessible versions is the <italic>de facto</italic> method for automating the rewriting of scholarly abstracts into more accessible versions (<xref rid="R27" ref-type="bibr">Xu et al., 2015</xref>; <xref rid="R5" ref-type="bibr">Goldsack et al., 2022</xref>; <xref rid="R8" ref-type="bibr">Joseph et al., 2023</xref>). Accordingly, we introduced the Scientific Abstract-Significance Statement (SASS) corpus (<xref rid="R24" ref-type="bibr">Wang &#x0026; Clark, 2024</xref>), a dataset composed of paired abstracts and significance statements from diverse disciplines, with the latter targeting &#x2018;an undergraduate-educated scientist outside their field of specialty&#x2019; (<xref rid="R1" ref-type="bibr">Berenbaum, 2021</xref>; <xref rid="R17" ref-type="bibr">Pool et al., 2021</xref>). Although the simplified abstracts generated from language models fine-tuned on the SASS corpus are approximately three grade levels more readable than the original abstracts, as measured by US grade-based readability scores (<xref rid="R24" ref-type="bibr">Wang &#x0026; Clark, 2024</xref>, Sec. 6), the documents are still not sufficiently accessible; even the best models produce college-level texts. Additionally, because the vocabulary used in significance statements is often just as complex as that found in the abstracts themselves (<xref rid="R24" ref-type="bibr">Wang &#x0026; Clark, 2024</xref>, Sec. 3), the readability improvements are primarily due to shorter sentences, and technical terms remain inadequately addressed.</p>
<p>Alternatively, the optimisation of a language model can be guided by a chosen objective in an actor-critic manner (<xref rid="R18" ref-type="bibr">Ramamurthy et al., 2023</xref>). It is intuitive to choose an established document readability measure, such as the Automated Readability Index (ARI; see Section 4.2.2), to assess the overall readability of the outputs generated by the language model. However, we found that the optimisation of language models guided by ARI is highly unstable, often resulting in the production of seemingly more accessible versions that still contain many technical terms. Inspired by Riddell and Igarashi (<xref rid="R19" ref-type="bibr">2021</xref>), we decomposed the measurement of document readability into two distinct measures: one at the sentence level and one at the word level. We then prioritized word-level accessibility in the optimisation to encourage the model to use more accessible words instead of simply shortening sentences.</p>
</sec>
<sec id="sec1_3">
<title>Contribution</title>
<p>Our work aims to serve as a bridge between scholarly works and the general public, particularly benefiting younger readers and those without a college degree.</p>
<list list-type="order">
<list-item><p>We address the common challenges in science communication by rewriting scholarly abstracts at a high school reading level using a language model.</p></list-item>
<list-item><p>We identify the challenges language models face in properly addressing jargon and propose Reinforcement Learning from Accessibility Measures (RLAM) as a means to improve the models&#x2019; use of accessible terms in their rewrites. RLAM-trained language models can significantly reduce the reading level of a scholarly abstract from a postgraduate level to a high school level, achieving a 3 grade-level reduction or about a 90% performance boost compared to models fine-tuned using the same corpus.</p></list-item>
<list-item><p>We observe systematic differences between reinforcement learning models guided by different rewards and conclude that disproportionate weights for sentence-level rewards contribute to unstable training and lower simplification quality.</p></list-item>
</list>
<p>Our code, model generations and training logs are available at <ext-link ext-link-type="uri" xlink:href="https://github.com/Wang-Haining/RLAM">https://github.com/Wang-Haining/RLAM</ext-link> under a permissive licence.</p>
</sec>
<sec id="sec1_4">
<title>Scientific abstract-significance statement (SASS) corpus</title>
<p>We used the scientific abstract significance statement (SASS) corpus in our experiments. This corpus is composed of 3,430 abstract-significance statement pairs derived from <italic>PNAS</italic> and divided into training (3,030 samples), validation (200 samples), and test sets (200 samples) (<xref rid="R24" ref-type="bibr">Wang &#x0026; Clark, 2024</xref>). It covers a wide range of disciplines, ensuring diverse representation across various fields, as shown in <xref ref-type="fig" rid="F1">Figure 1</xref>. Corpus statistics are shown in <xref ref-type="table" rid="T1">Table 1</xref>; refer to Section 4.2 for a detailed description of the measures.</p>
<table-wrap id="T1">
<label>Table 1.</label>
<caption><p>Corpus statistics for the scientific abstract-significance statement (SASS) corpus. Metrics include ARI (Automated Readability Index), F-K (Flesch-Kincaid readability test), VOA (log ratio of proportion of words found in the VOA1500 vocabulary), SL (average sentence length and number of sentences), WA (word accessibility; log frequency per 1 billion tokens in English Wikipedia), and WL (average word length). Measures whose names are followed by a down arrow symbol (<italic>&#x2193;</italic>) indicate that lower values correspond to a more readable document. Numeric values in parentheses are the corresponding standard deviations. Paired t-tests were conducted for each metric comparing the abstracts and significance statements, with p-values adjusted using the Bonferroni correction for multiple comparisons. The observed differences in each of the measurements are statistically significant after adjusting for the grouped p-values at a significance level of 0.05.</p></caption>
<table>
<thead>
<tr>
<th align="left" valign="top">Section</th>
<th align="center" valign="top">ARI<italic>&#x2193;</italic></th>
<th align="center" valign="top">F-K<italic>&#x2193;</italic></th>
<th align="center" valign="top">VOA</th>
<th align="center" valign="top">SL<italic>&#x2193;</italic></th>
<th align="center" valign="top">WA</th>
<th align="center" valign="top">WL<italic>&#x2193;</italic></th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">Abstract</td>
<td align="center" valign="top">18.9<sub>(2.8)</sub></td>
<td align="center" valign="top">19.2<sub>(2.4)</sub></td>
<td align="center" valign="top">-0.43<sub>(0.25)</sub></td>
<td align="center" valign="top">25.4<sub>(4.9)</sub></td>
<td align="center" valign="top">12.0<sub>(0.4)</sub></td>
<td align="center" valign="top">5.3<sub>(0.4)</sub></td>
</tr>
<tr>
<td align="left" valign="top">Significance</td>
<td align="center" valign="top">18.1*<sub>(3.1)</sub></td>
<td align="center" valign="top">18.6*<sub>(2.7)</sub></td>
<td align="center" valign="top">-0.31*<sub>(0.26)</sub></td>
<td align="center" valign="top">23.9*<sub>(5.3)</sub></td>
<td align="center" valign="top">11.9*<sub>(0.4)</sub></td>
<td align="center" valign="top">5.4*<sub>(0.4)</sub></td>
</tr>
</tbody>
</table>
</table-wrap>
<p>We observed that significance statements are semantically coherent with their corresponding abstracts. The corpus statistics indicate that significance statements are more readable than abstracts, as shown by lower mean values in the Automated Readability Index (ARI) and Flesch-Kincaid readability test (F-K). This suggests that the SASS corpus can be useful in simplifying scholarly abstracts across diverse disciplines.</p>
<fig id="F1">
<label>Figure 1.</label>
<caption><p>Discipline and readability distributions of abstracts and significance statements found in the training set of the Scientific Abstract-Significance Statement corpus. The count of paired samples in different disciplines is shown in blue bars on a log10 scale (disciplines with fewer than three samples are not shown). Readability is measured using the Automated Readability Index (ARI), which estimates the number of years of schooling required to understand a text. On average, abstracts have a readability slightly below 20 ARI, indicating a post-graduate level. Significance statements are generally more readable than their corresponding abstracts.</p></caption>
<graphic xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="images/c18-fig1.jpg"><alt-text>none</alt-text></graphic>
</fig>
<p>We also observed that word accessibility (i.e., log frequency per 1 billion tokens found in English Wikipedia) and average word length suggest that significance statements can be less accessible at the word level than are their corresponding abstracts. Although the log ratio of words found in the VOA1500 vocabulary is slightly lower than in the corresponding abstracts, these 1,500 words are very basic and include a high proportion of function words. Considering that significance statements use approximately 1.5 fewer words on average, the increased use of VOA words may be a consequence of the higher use of function words to maintain grammaticality.</p>
</sec>
</sec>
<sec id="sec2">
<title>Reinforcement learning from accessibility measures</title>
<sec id="sec2_1">
<title>Language modelling via proximal policy optimisation</title>
<p>At the core of our approach is language modelling with Proximal Policy Optimisation (PPO) (<xref rid="R22" ref-type="bibr">Schulman et al., 2017</xref>) guided by two accessibility measures. A causal language model trained on large corpora can generate the next token based on the current sequence, which is useful in the context of reinforcement learning for developing a policy model that determines the most appropriate next token to maximize the expected return in terms of document readability.</p>
<p>The process begins with an input sequence <italic>s</italic><sub>0</sub> = (<italic>a</italic><sub>0</sub><italic>, a</italic><sub>1</sub><italic>,..., a<sub>i</sub></italic>), where each <italic>a<sub>i</sub></italic> is from a set of tokens <italic>W</italic>, and <italic>s</italic><sub>0</sub> represents an abstract formatted in a simple template. The language model <italic>&#x03C0;<sub>&#x03B8;</sub></italic> then generates <italic>a</italic><sub>0</sub><italic>, a</italic><sub>1</sub><italic>,..., a<sub>T-1</sub></italic> <italic>&#x223C; &#x03C0;<sub>&#x03B8;</sub></italic> (&#x00B7;<italic> | s<sub>t</sub></italic>), creating its accessible version until the maximum number of tokens <italic>T</italic> is reached, either due to the context length or an end-of-sentence token:</p>
<disp-formula><label>(1)</label><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" display="block"><mml:mrow><mml:msub><mml:mi>&#x03C0;</mml:mi><mml:mi>&#x03B8;</mml:mi></mml:msub><mml:mfenced><mml:mrow><mml:msub><mml:mi>a</mml:mi><mml:mn>0</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>a</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:mn>...</mml:mn><mml:mo>,</mml:mo><mml:msub><mml:mi>a</mml:mi><mml:mrow><mml:mi>T</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow></mml:mfenced><mml:mo>=</mml:mo><mml:mstyle displaystyle='true'><mml:munderover><mml:mo>&#x220F;</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:munderover><mml:mrow><mml:msub><mml:mi>&#x03C0;</mml:mi><mml:mi>&#x03B8;</mml:mi></mml:msub><mml:mfenced><mml:mrow><mml:msub><mml:mi>a</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>&#x007C;</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:mrow></mml:mfenced></mml:mrow></mml:mstyle></mml:mrow></mml:math></disp-formula>
<p>Our objective is to learn a policy model that, given an abstract, models the joint probability of tokens leading to a high reward in terms of accessibility while maintaining semantic coherence. Formally, this is expressed as:</p>
<disp-formula><label>(2)</label><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" display="block"><mml:mrow><mml:mi>J</mml:mi><mml:mfenced><mml:mrow><mml:mi>&#x03C0;</mml:mi><mml:mi>&#x03B8;</mml:mi></mml:mrow></mml:mfenced><mml:mo>=</mml:mo><mml:msub><mml:mi>E</mml:mi><mml:mrow><mml:mi>&#x03C4;</mml:mi><mml:mo>~</mml:mo><mml:msub><mml:mi>&#x03C0;</mml:mi><mml:mn>0</mml:mn></mml:msub></mml:mrow></mml:msub><mml:mfenced close="]" open="["><mml:mrow><mml:mstyle displaystyle='true'><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:munderover><mml:mrow><mml:mfenced><mml:mrow><mml:mi>r</mml:mi><mml:mfenced><mml:mrow><mml:msub><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>a</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:mrow></mml:mfenced><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>&#x03B2;</mml:mi><mml:mrow><mml:mtext>KL</mml:mtext></mml:mrow></mml:msub><mml:mtext>KL</mml:mtext><mml:mfenced><mml:mrow><mml:mi>&#x03C0;</mml:mi><mml:mi>&#x03B8;</mml:mi><mml:mfenced><mml:mrow><mml:msub><mml:mi>a</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>&#x007C;</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:mrow></mml:mfenced><mml:mo>&#x007C;</mml:mo><mml:mo>&#x007C;</mml:mo><mml:msub><mml:mi>&#x03C0;</mml:mi><mml:mrow><mml:mi>&#x03B8;</mml:mi><mml:mi>S</mml:mi><mml:mi>F</mml:mi><mml:mi>T</mml:mi></mml:mrow></mml:msub><mml:mfenced><mml:mrow><mml:msub><mml:mi>a</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>&#x007C;</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:mrow></mml:mfenced></mml:mrow></mml:mfenced></mml:mrow></mml:mfenced></mml:mrow></mml:mstyle></mml:mrow></mml:mfenced></mml:mrow></mml:math></disp-formula>
<p>Here, <italic>J</italic>(<italic>&#x03C0;<sub>&#x03B8;</sub></italic>) represents the expected return when following policy <italic>&#x03C0;<sub>&#x03B8;</sub></italic>. The reward <italic>r</italic>(<italic>s<sub>t</sub>, a<sub>t</sub></italic>) is estimated for each time step <italic>t</italic> in the trajectory <italic>&#x03C4;</italic> = (<italic>s</italic><sub>0</sub><italic>,a</italic><sub>0</sub><italic>,s</italic><sub>1</sub><italic>,a</italic><sub>1</sub><italic>,..., s<sub>T-1</sub>,a<sub>T-1</sub></italic>), where <italic>s<sub>t</sub></italic> is the sequence of tokens at time <italic>t</italic>, formed as the concatenation of <italic>s</italic><sub>0</sub> and the tokens <italic>a</italic><sub>0</sub><italic>,a</italic><sub>1</sub><italic>,...,a<sub>t-1</sub></italic>. This formula iteratively computes the rewards given the current sequence <italic>s<sub>t</sub></italic> and the token <italic>a<sub>t</sub></italic> chosen by the policy. The <italic>&#x03B2;</italic><sub>KL</sub>-weighted KL divergence term KL(<italic>&#x03C0;<sub>&#x03B8;</sub></italic>(<italic>a<sub>t</sub> | s<sub>t</sub></italic>) <italic>&#x2225; &#x03C0;<sub>&#x03B8;</sub></italic><sub>SFT</sub>(<italic>a<sub>t</sub> | s<sub>t</sub></italic>)) is applied at every step of sequence generation to ensure the policy does not deviate significantly from the supervised fine-tuned model. This is crucial because, without such a constraint, the policy model might quickly learn to output whatever the reward model favours to maximize its return, which can lead to undesirable behaviours. For instance, the model might repeatedly output a frequent word (&#x2018;is is is ...&#x2019;), achieving a high reward based solely on accessibility measures but lacking meaningful content. Following Stiennon et al. 
(2020), <italic>&#x03B2;</italic><sub>KL</sub> is dynamically adjusted by targeting a specific KL divergence between <italic>&#x03C0;<sub>&#x03B8;</sub></italic> and <italic>&#x03C0;<sub>&#x03B8;</sub></italic><sub>SFT</sub> using a capped proportional controller in logarithmic space. The benefit of using a dynamic KL control, as opposed to a fixed one, is that it allows the model to adapt more flexibly to different stages of training, accommodating varying levels of KL divergence between the policy and SFT model.</p>
<p>In practice, we fine-tune the policy model in an actor-critic manner: while the policy model (the actor) generates sequences of tokens based on the current sequence <italic>s<sub>t</sub></italic>, the critic is an additional linear layer that takes the output of the language model&#x2019;s last layer and produces a scalar for time step <italic>t</italic>, estimating the expected cumulative reward of producing the token <italic>a<sub>t</sub></italic>, noted as <italic>V<sub>&#x03C1;</sub></italic>(<italic>a<sub>t</sub></italic>). The problem is reduced to optimising at every step the expected cumulative reward of taking action <italic>a<sub>t</sub></italic> in state <italic>s<sub>t</sub></italic> and following the policy <italic>&#x03C0;<sub>&#x03B8;</sub></italic> thereafter (<italic>V<sub>&#x03C1;</sub></italic>(<italic>s<sub>t</sub></italic><sub>+1</sub>) = <italic>V<sub>&#x03C1;</sub></italic>(<italic>s<sub>t</sub>,a<sub>t</sub></italic>)), compared to the expected cumulative reward of being in state <italic>s</italic> (<italic>V<sub>&#x03C1;</sub></italic>(<italic>s<sub>t</sub></italic>)), termed as the advantage <italic>d<sub>t</sub></italic> = <italic>V<sub>&#x03C1;</sub></italic>(<italic>s<sub>t</sub></italic><sub>+1</sub>) <italic>- V<sub>&#x03C1;</sub></italic>(<italic>s<sub>t</sub></italic>).</p>
<p>We used the final reward of the entire generation (see Section 3.3) and back-propagate it through the sequence using Temporal Difference (TD) and Generalised Advantage Estimation (GAE) to estimate the advantage of each token:</p>
<disp-formula><label>(3)</label><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" display="block"><mml:mrow><mml:msub><mml:mi>d</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mstyle displaystyle='true'><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0027;</mml:mo><mml:mo>=</mml:mo><mml:mi>t</mml:mi></mml:mrow><mml:mi>T</mml:mi></mml:munderover><mml:mrow><mml:msup><mml:mrow><mml:mfenced><mml:mrow><mml:mi>&#x03B3;</mml:mi><mml:mi>&#x03BB;</mml:mi></mml:mrow></mml:mfenced></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0027;</mml:mo></mml:mrow></mml:msup><mml:mfenced><mml:mrow><mml:msub><mml:mi>r</mml:mi><mml:mi>T</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:mi>&#x03B3;</mml:mi><mml:msub><mml:mi>V</mml:mi><mml:mi>&#x03C1;</mml:mi></mml:msub><mml:mfenced><mml:mrow><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0027;</mml:mo><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow></mml:mfenced><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>V</mml:mi><mml:mi>&#x03C1;</mml:mi></mml:msub><mml:mfenced><mml:mrow><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0027;</mml:mo></mml:mrow></mml:msub></mml:mrow></mml:mfenced></mml:mrow></mml:mfenced></mml:mrow></mml:mstyle></mml:mrow></mml:math></disp-formula>
<p>where <italic>&#x03B3;</italic> is the discount factor for future rewards, <italic>&#x03BB;</italic> controls the bias-variance trade-off, and <italic>r<sub>T</sub></italic> is the final reward, which, in our case, is a linear combination of two accessibility measures. <italic>V<sub>&#x03C1;</sub></italic> is trained by minimizing the square error loss; see Equation 5.</p>
<p>We used the Proximal Policy Optimisation (PPO) (<xref rid="R22" ref-type="bibr">Schulman et al., 2017</xref>) clipped surrogate objective with importance sampling to more efficiently use offline samples to update the online policy. Importance sampling corrects for the discrepancy between the behaviour policy that generated the samples and the current policy by weighting the samples using the ratio of their probabilities under both policies. The PPO algorithm introduces a clipping mechanism to balance exploration and exploitation while preventing large, potentially harmful updates to the policy, see Equation 4. The whole RLAM algorithm is illustrated in Algorithm 1.</p>
<p><bold>Algorithm 1</bold> Training with Reinforcement Learning from Uncombined Accessibility Measures. The policy model is updated using the PPO clipped surrogate objective (Eq. 4), and the value model is updated by minimising a square-error objective (Eq. 5).</p>
<list list-type="order">
<list-item><p><bold>Input:</bold> initial policy model <italic>&#x03C0;<sub>&#x03B8;</sub></italic><sub>SFT</sub>, randomly initiated value head <italic>V<sub>&#x03C1;</sub></italic><sub>init</sub>, final reward function <italic>r<sub>T</sub></italic> for the last token, weighted by <italic>&#x03B2;</italic><sub>KL</sub> for KL divergence term; task prompts <italic>X</italic>; hyperparameters <italic>&#x03B3;,&#x03BB;,&#x223C;</italic></p></list-item>
<list-item><p><italic>&#x03C0;&#x03B8; &#x2190; &#x03C0;<sub>&#x03B8;</sub></italic><sub>SFT</sub>, <italic>V<sub>&#x03C1;</sub> &#x2190; V<sub>&#x03C1;</sub></italic><sub>init</sub></p></list-item>
<list-item><p><bold>for</bold> step = 1<italic>,...,</italic> M<bold> do</bold></p></list-item>
<list-item><p>Sample a batch {s<sub>0</sub>}<sup>n</sup> from <italic>X</italic></p></list-item>
<list-item><p>Sample output sequences <italic>{a</italic><sub>0</sub><italic>, a</italic><sub>1</sub><italic>,..., a<sub>T-</sub></italic><sub>1</sub><italic>}<sup>n</sup> &#x223C; &#x03C0;<sub>&#x03B8;</sub></italic> (<italic>&#x00B7; | s</italic><sub>0</sub>) for each prompt <italic>s</italic><sub>0</sub> in the batch &#x22B3; Eq. 1</p></list-item>
<list-item><p>Compute final reward <italic>r<sub>T</sub></italic> for each sampled output sequence <italic>{a</italic><sub>0</sub><italic>, a</italic><sub>1</sub><italic>,...,a<sub>T-</sub></italic><sub>1</sub><italic>}<sup>n.</sup></italic> &#x22B3;Sec. 3.3</p></list-item>
<list-item><p>Distribute the final reward <italic>r<sub>T</sub></italic> to each token in the sequence through GAE &#x22B3; Eq. 3</p></list-item>
<list-item><p>Compute advantages <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" display="inline"><mml:mrow><mml:msubsup><mml:mrow><mml:mfenced close="}" open="{"><mml:mrow><mml:msub><mml:mi>d</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>&#x007C;</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:mrow></mml:mfenced></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula>, value targets <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" display="inline"><mml:mrow><mml:msubsup><mml:mrow><mml:mfenced close="}" open="{"><mml:mrow><mml:msub><mml:mi>V</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mi>arg</mml:mi></mml:mrow></mml:msub><mml:mo>&#x007C;</mml:mo><mml:mfenced><mml:mrow><mml:msub><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:mrow></mml:mfenced></mml:mrow></mml:mfenced></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula> for each sequence with <italic>V<sub>&#x03C1;</sub></italic> and compute KL divergence penalty KL<sub><italic>t</italic></sub> = KL (<italic>&#x03C0;<sub>&#x03B8;</sub></italic> (<italic>a<sub>t</sub> | s<sub>t</sub></italic>) <italic>&#x2225; &#x03C0;<sub>&#x03B8;</sub></italic><sub>SFT</sub> (<italic>a<sub>t</sub> | s<sub>t</sub></italic>))</p></list-item>
<list-item><p><bold>for</bold> PPO iteration = 1<italic>,..., &#x00B5;</italic><bold> do</bold></p></list-item>
<list-item><p>Update the policy model by maximizing the PPO clipped surrogate objective with KL penalty:
<disp-formula><label>(4)</label><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" display="block"><mml:mrow><mml:mi>&#x03B8;</mml:mi><mml:mo>&#x2190;</mml:mo><mml:mi>arg</mml:mi><mml:mtext>&#x2009;</mml:mtext><mml:munder><mml:mrow><mml:mi>max</mml:mi></mml:mrow><mml:mi>&#x03B8;</mml:mi></mml:munder><mml:mfrac><mml:mn>1</mml:mn><mml:mi>n</mml:mi></mml:mfrac><mml:mstyle displaystyle='true'><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>n</mml:mi></mml:munderover><mml:mrow><mml:mfrac><mml:mn>1</mml:mn><mml:mi>T</mml:mi></mml:mfrac><mml:mstyle displaystyle='true'><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>T</mml:mi></mml:munderover><mml:mrow><mml:mi>min</mml:mi><mml:mfenced><mml:mrow><mml:mfrac><mml:mrow><mml:mi>&#x03C0;</mml:mi><mml:msub><mml:mi>&#x03B8;</mml:mi><mml:mrow><mml:mtext>online</mml:mtext></mml:mrow></mml:msub><mml:mfenced><mml:mrow><mml:msub><mml:mi>a</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>&#x007C;</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:mrow></mml:mfenced></mml:mrow><mml:mrow><mml:mi>&#x03C0;</mml:mi><mml:msub><mml:mi>&#x03B8;</mml:mi><mml:mrow><mml:mtext>offline</mml:mtext></mml:mrow></mml:msub><mml:mfenced><mml:mrow><mml:msub><mml:mi>a</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>&#x007C;</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:mrow></mml:mfenced></mml:mrow></mml:mfrac><mml:mfenced><mml:mrow><mml:msub><mml:mi>A</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>&#x03B2;</mml:mi><mml:mrow><mml:mtext>KL</mml:mtext></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mtext>KL</mml:mtext></mml:mrow><mml:mi>t</mml:mi></mml:msub></mml:mrow></mml:mfenced></mml:mrow></mml:mfenced></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:mrow></mml:math></disp-formula></p></list-item>
<list-item><p><bold>end for</bold></p></list-item>
<list-item><p>Update the value model by minimizing a square-error objective:
<disp-formula><label>(5)</label><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" display="block"><mml:mrow><mml:mi>&#x03C1;</mml:mi><mml:mo>&#x2190;</mml:mo><mml:mi>arg</mml:mi><mml:mtext>&#x2009;</mml:mtext><mml:munder><mml:mrow><mml:mi>min</mml:mi></mml:mrow><mml:mi>&#x03C1;</mml:mi></mml:munder><mml:mfrac><mml:mn>1</mml:mn><mml:mi>n</mml:mi></mml:mfrac><mml:mstyle displaystyle='true'><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>n</mml:mi></mml:munderover><mml:mrow><mml:mfrac><mml:mn>1</mml:mn><mml:mi>T</mml:mi></mml:mfrac><mml:mstyle displaystyle='true'><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>T</mml:mi></mml:munderover><mml:mrow><mml:msup><mml:mrow><mml:mfenced><mml:mrow><mml:msub><mml:mi>V</mml:mi><mml:mi>&#x03C1;</mml:mi></mml:msub><mml:mfenced><mml:mrow><mml:msub><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:mrow></mml:mfenced><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>V</mml:mi><mml:mrow><mml:mtext>targ</mml:mtext></mml:mrow></mml:msub><mml:mfenced><mml:mrow><mml:msub><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:mrow></mml:mfenced></mml:mrow></mml:mfenced></mml:mrow><mml:mn>2</mml:mn></mml:msup></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:mrow></mml:math></disp-formula></p></list-item>
<list-item><p><bold>end for</bold></p></list-item>
<list-item><p><bold>Output:</bold> <italic>&#x03C0;<sub>&#x03B8;</sub></italic></p></list-item>
</list>
</sec>
<sec id="sec2_2">
<title>Adaptive Kullback&#x2013;Leibler controller</title>
<p>Following Ziegler et al. (2019, Sec. 2.2), we dynamically adjust <italic>&#x03B2;</italic><sub>KL</sub> to target a specific KL divergence value, KL<sub>target</sub>, using a log-space proportional controller.</p>
<p>The update rule is:</p>
<disp-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" display="block"><mml:mrow><mml:msub><mml:mi>&#x03B2;</mml:mi><mml:mrow><mml:msub><mml:mrow><mml:mtext>KL</mml:mtext></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>&#x03B2;</mml:mi><mml:mrow><mml:msub><mml:mrow><mml:mtext>KL</mml:mtext></mml:mrow><mml:mi>t</mml:mi></mml:msub></mml:mrow></mml:msub><mml:mfenced><mml:mrow><mml:mfenced><mml:mrow><mml:mn>1</mml:mn><mml:mo>+</mml:mo><mml:msub><mml:mi>K</mml:mi><mml:mi>&#x03B2;</mml:mi></mml:msub><mml:mo>&#x22C5;</mml:mo><mml:mtext>clip</mml:mtext><mml:mfenced><mml:mrow><mml:mfrac><mml:mrow><mml:mtext>KL</mml:mtext><mml:mfenced><mml:mrow><mml:msub><mml:mi>&#x03C0;</mml:mi><mml:mi>&#x03B8;</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>&#x03C0;</mml:mi><mml:mrow><mml:msub><mml:mi>&#x03B8;</mml:mi><mml:mrow><mml:mtext>SFT</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:msub></mml:mrow></mml:mfenced><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mrow><mml:mtext>KL</mml:mtext></mml:mrow><mml:mrow><mml:mtext>target</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mtext>KL</mml:mtext></mml:mrow><mml:mrow><mml:mtext>target</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:mfrac><mml:mo>,</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mn>0.2</mml:mn><mml:mo>,</mml:mo><mml:mn>0.2</mml:mn></mml:mrow></mml:mfenced></mml:mrow></mml:mfenced></mml:mrow></mml:mfenced></mml:mrow></mml:math></disp-formula>
<p>where <italic>K<sub>&#x03B2;</sub></italic> is the proportional gain, set to 0<italic>.</italic>01.</p>
</sec>
<sec id="sec2_3">
<title>Reward function</title>
<p>The reward function evaluates the overall quality of the output (<italic>s<sub>T</sub></italic>). After the initial failures of testing a traditional readability measure (i.e., ARI) as the criterion, we decided to use a balance of two accessibility measures: average sentence length in words and word accessibility, adopted from Riddell and Igarashi (<xref rid="R19" ref-type="bibr">2021</xref>), to guide the optimisation.</p>
<p><bold>Word accessibility reward</bold> A word&#x2019;s accessibility is approximated by how frequently it appears in a large reference corpus. We chose the English Wikipedia corpus due to its domain similarity and applied a Moses tokenizer, yielding a vocabulary of 14.6 million types from a total of 3.6 billion tokens. If a token is among the most common 100,000 types, we report its frequency per billion tokens as its accessibility measure. Otherwise, we estimate its frequency using ridge regression with an <italic>l</italic>2-norm coefficient equal to 1<italic>.</italic>0. This model allows us to make serviceable estimates of the frequency of arbitrary tokens, including tokens that do not appear in the reference corpus. This model takes as input the token&#x2019;s length in Unicode code points, its byte unigrams, byte bigrams, and byte trigrams. The model estimates the token&#x2019;s log frequency per 1 billion tokens. We used the natural logarithm of frequencies per billion tokens as the measure of word accessibility. For example, the accessibility score for &#x2018;big&#x2019; is 11.8, while &#x2018;colossal&#x2019; scores 7.3. Despite being comparable in meaning, the model&#x2019;s production of the latter will receive fewer rewards. Coefficient <italic>&#x03B2;</italic><sub>wa</sub> is to control the scale of the credit given for word accessibility.</p>
<p>We have faithfully followed the experiment of Riddell and Igarashi (<xref rid="R19" ref-type="bibr">2021</xref>) with three differences. First, our reference corpus is the English Wikipedia, whereas the original study used the Common Crawl News corpus. Second, we did not discard duplicated sentences as Riddell and Igarashi (<xref rid="R19" ref-type="bibr">2021</xref>) did, because we found that sentence duplication is not common in Wikipedia. Third, the original study reported word <italic>inaccessibility</italic> scores by negating the logarithm of frequency per billion. We report <italic>accessibility</italic>, without negation, because it is more naturally suited to serve as a reward. Refer to Riddell and Igarashi (<xref rid="R19" ref-type="bibr">2021</xref>, pp. 1186&#x2013;1187) for the training of the ridge regression.</p>
<p><bold>Sentence length reward</bold> Sentence length is also determined by a Moses tokeniser, which preserves hyphenation and splits contractions. For the Moses rule-based tokeniser, we use the <italic>sacremoses</italic> Python package. We negate the value of sentence length for intuitive calculation of the rewards for optimisation.</p>
</sec>
</sec>
<sec id="sec3">
<title>Experiment setup</title>
<sec id="sec3_1">
<title>Training</title>
<p>We initialised the policy models <italic>&#x03C0;<sub>&#x03B8;</sub></italic> by adopting the Gemma-2B checkpoint reported in Wang and Clark (<xref rid="R24" ref-type="bibr">2024</xref>) (<italic>&#x03C0;<sub>&#x03B8;</sub></italic><sub>SFT</sub>). The original Gemma-2B was trained on three trillion tokens, consisting of publicly available data as well as proprietary datasets comprising &#x2018;primarily English data from web documents&#x2019; (<xref rid="R13" ref-type="bibr">Mesnard et al., 2024</xref>). The specific checkpoint we adopted was fine-tuned using the SASS corpus in a straightforward manner. It was chosen for its strong performance for simplification quality, faithfulness, and relatively compact size.</p>
<p>The two accessibility rewards were weighted as follows: the word accessibility reward was set to <italic>&#x03B2;</italic><sub>WA</sub> = 4<italic>.</italic>0, and we report two models with different <italic>&#x03B2;</italic><sub>SL</sub> values of 0<italic>.</italic>05 and 0<italic>.</italic>2. Because word accessibility is in logarithmic space, we subtracted 10 from the average word accessibility of the output and reset any values lower than 10 to 0 to keep them within a reasonable range. For adaptive control of the per-token semantic reward, we started with an initial <italic>&#x03B2;</italic><sub>KL</sub> = 0<italic>.</italic>2 and targeted a KL divergence of 8<italic>.</italic>0 nats during the training course, capping it in the range between 0<italic>.</italic>15 and 0<italic>.</italic>25. We used a micro batch size of 4, with each sequence used to run the PPO algorithm for 4 epochs using importance sampling, with gradient accumulation steps set to 4. We used a clip range of 0<italic>.</italic>2 for the policy gradient and value function estimation to ensure stability. The value function coefficient was set to 0<italic>.</italic>1. The optimisation used standard AdamW optimiser parameters (<italic>&#x03B2;</italic><sub>1</sub> = 0<italic>.</italic>9, <italic>&#x03B2;</italic><sub>2</sub> = 0<italic>.</italic>999, <italic>&#x03B5;</italic> = 1 <italic>&#x00D7;</italic> 10<sup><italic>-</italic>8</sup>) (<xref rid="R10" ref-type="bibr">Kingma and Ba, 2015</xref>; <xref rid="R11" ref-type="bibr">Loshchilov and Hutter, 2017</xref>). The learning rate was fixed at 1<italic>&#x00D7;</italic>10<sup><italic>-</italic>6</sup>. The training was conducted on 2 H100 (80GB) GPUs using mixed precision training in bfloat16. The sampling temperature was set to 0<italic>.</italic>7 in the rollout phase. Following Huang et al. 
(2024), we assigned unfinished rollouts (indicated by the missing end of sequence token) with a fixed low score to encourage the model to generate complete simplified narratives. We also report a model trained as dictated by ARI (i.e., RLARI; described in Section 1.2), with all other parameters kept the same as those guided by the accessibility measures. We tested other hyperparameters for guiding the optimisation of RLARI in pilot studies but obtained similar results.</p>
<p>We performed multiple runs for each reinforcement learning process and selected the checkpoint to report based on a balance of semantic retention and ARI score obtained on the validation set. We observed that the readability of the generated text on the validation set often began to decline rapidly after plateauing for a while, typically accompanied by a surge in the standard deviation of average word accessibility among sentences. This signals that the language model was sacrificing language quality and semantic relevance for extra improvements in readability. Therefore, we reported the checkpoint immediately before such instability occurred.</p>
</sec>
<sec id="sec3_2">
<title>Evaluation</title>
<p>We evaluated the simplified texts generated by the language models trained with the reinforcement learning framework using 200 abstracts from the SASS corpus test set for simplification. Though advanced decoding methods might further refine the quality of the outputs, we used multinomial sampling with the temperature set to zero to intentionally produce the most deterministic outputs. This approach helped us better understand the modelling of accessible language and made it easier for us to identify potential quirks. We assessed the quality of the generated simplified abstracts both quantitatively and qualitatively. Quantitatively, we measured the generated texts based on their semantic retention and accessibility using established relevance measures as well as readability and accessibility metrics.</p>
</sec>
<sec id="sec3_3">
<title>Semantic retention</title>
<p>BERTScore calculates the cosine similarity between each token in the candidate sentence and each token in the reference sentence using contextual embeddings from a pre-trained language model; its results align well with human judgement on semantic similarity evaluation (<xref rid="R29" ref-type="bibr">Zhang et al., 2019</xref>). It is not directly influenced by lexical overlap, making it more suitable for evaluating simplification systems than are metrics that rely on matching words, such as BLEU (<xref rid="R14" ref-type="bibr">Papineni et al., 2002</xref>). For our evaluation, we used embeddings from the 18th layer of a BERT-large-uncased model and reported the F1 score. This choice is based on prior findings indicating that the 18th layer yields a strong Pearson correlation (0.72) on the WMT16 To-English benchmark (<xref rid="R29" ref-type="bibr">Zhang et al., 2019</xref>).</p>
</sec>
<sec id="sec3_4">
<title>Simplification and accessibility</title>
<p>Accessibility can be measured with respect to the overall simplification quality (SARI); readability (ARI and Flesch-Kincaid); and other straightforward document complexity measures, including average sentence length, word accessibility (i.e., log frequency per 1 billion tokens found in English Wikipedia), the log ratio of its proportion of VOA Special English words (1,517 types in total), and average word length.</p>
<p>SARI (System output Against References and against the Input sentence) is specifically designed to evaluate text simplification (<xref rid="R28" ref-type="bibr">Xu et al., 2016</xref>). It aims to measure how well a simplified text retains the original meaning while improving readability. SARI provides a balanced measure of how well a text simplification system performs by focusing on the necessary operations of adding, deleting, and retaining words.</p>
<p>ARI and Flesch-Kincaid readability tests assign a numerical score to text that reflects the US grade level required for comprehension. Lower scores (1&#x2013;13) indicate content suitable for kindergarten through twelfth grade, with each score corresponding to a subsequent grade level. Scores in the range of 14&#x2013;18 suggest college-level readability, ranging from first- to senior-year content appropriateness. Higher scores (19 and above) are associated with advanced college education. Both measures use average sentence length. Flesch-Kincaid uses syllables per word, while ARI uses characters per word for its linear combination with sentence length.</p>
<p>We harvested VOA Special English vocabulary comprising 1,517 unique words (VOA1500). We included VOA Special English Word Book Sections A-Z, Science Programs, and Organs of the Body hosted on Wikipedia (<xref rid="R26" ref-type="bibr">Wikimedia Foundation, 2024</xref>). We calculate the ratio of words that appear in the VOA1500 to those that do not, then report the natural logarithm of this ratio for each generated sample. Values above 0 indicate that the text contains more Special English words than non-Special English words, and a higher value indicates a greater presence of &#x2018;easy&#x2019; words.</p>
</sec>
<sec id="sec3_5">
<title>Language quality, faithfulness, and completeness</title>
<p>We manually examined 5% of all generated samples, corresponding to a randomly chosen subset of the test set from the SASS corpus. Each generation is annotated with respect to language quality, faithfulness, and completeness using a rubric of Good, Acceptable, and Poor. We focused on fluency and grammaticality and hand-picked both good and problematic examples when evaluating language quality. For the evaluation of faithfulness, we conduct close readings to assess the extent to which a simplified abstract remains factually faithful to the original narrative. If uncertainty arises, we consult the corresponding manuscript, as our abstract simplification system must avoid producing misinformation. Completeness is also a key consideration, as it is essential to include the main findings and implications of the research for the general public, since this is the primary goal of scientific dissemination.</p>
</sec>
</sec>
<sec id="sec4">
<title>Findings and discussion</title>
<sec id="sec4_1">
<title>Quantitative assessment</title>
<p><xref ref-type="table" rid="T2">Table 2</xref> summarises the performance of Gemma-2B, tuned in different ways, when evaluated on the test set of the SASS corpus. The first scenario is the supervised fine-tuned baseline (SFT), which performs next-token prediction on the SASS corpus training set.</p>
<p>The second and third scenarios are reinforcement learning through PPO guided by ARI (RLARI) or accessibility measures (RLAM). We assessed the generation quality by considering both semantic retention and simplification, specifically using BERT score (BS), SARI, ARI, Flesch-Kincaid readability test (F-K), the log ratio of words in the VOA1500 vocabulary (VOA), sentence length (SL), word accessibility (WA), and word length (WL). A one-tailed paired t-test was conducted for each metric to compare observations between the reinforcement learning and supervised fine-tuning baselines, assuming improvement in document readability. Bonferroni correction was applied to each set of tests to maintain a family-wise significance level of 0.05.</p>
<table-wrap id="T2">
<label>Table 2.</label>
<caption><p>Comparison of Gemma-2B&#x2019;s performance across three approaches: the supervised fine-tuned baseline (SFT), reinforcement learning guided by ARI (using an intermediate checkpoint before significant policy gradient instability was observed, RLARI), and reinforcement learning guided by two accessibility measures (RLAM). SFT was fine-tuned using the Scientific Abstract-Significance Statement (SASS) corpus. The columns labelled <italic>&#x03B2;</italic><sub>WA</sub> and <italic>&#x03B2;</italic><sub>SL</sub> pertain specifically to RLAM, where the rewards for average word accessibility and sentence length are balanced. The inference on the test split from SASS uses multinomial sampling. Metrics ARI, F-K, SARI, VOA, SL, WA, WL, and BS stand for Automated Readability Index, Flesch-Kincaid readability test, log ratio of VOA1500 vocabulary, sentence length, word accessibility, word length, and BERTScore (F1), respectively. Measures followed by a down arrow symbol (<italic>&#x2193;</italic>) indicate that lower values are better. Numeric values in parentheses are the corresponding standard deviations. A paired two-tailed t-test was performed on observations of each measure between each model and the original abstracts. At a model-wise p-value of 0.05, measures that differ significantly from the SFT baseline are marked with an asterisk.</p></caption>
<table>
<thead>
<tr>
<th align="left" valign="top">Model</th>
<th align="center" valign="top"><italic>&#x03B2;</italic>SL</th>
<th align="center" valign="top"><italic>&#x03B2;</italic><sub>WA</sub></th>
<th align="center" valign="top">ARI<italic>&#x2193;</italic></th>
<th align="center" valign="top">F-K<italic>&#x2193;</italic></th>
<th align="center" valign="top">SARI</th>
<th align="center" valign="top">VOA</th>
<th align="center" valign="top">SL<italic>&#x2193;</italic></th>
<th align="center" valign="top">WA</th>
<th align="center" valign="top">WL<italic>&#x2193;</italic></th>
<th align="center" valign="top">BS</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">SFT</td>
<td align="center" valign="top">-</td>
<td align="center" valign="top">-</td>
<td align="center" valign="top">15.5 (3.0)</td>
<td align="center" valign="top">16.5 (2.6)</td>
<td align="center" valign="top">39.1 (5.0)</td>
<td align="center" valign="top">-0.26 (0.30)</td>
<td align="center" valign="top">20.6 (4.1)</td>
<td align="center" valign="top">11.9 (0.5)</td>
<td align="center" valign="top">5.2 (0.4)</td>
<td align="center" valign="top">0.64 (0.06)</td>
</tr>
<tr>
<td align="left" valign="top">RLARI</td>
<td align="center" valign="top">-</td>
<td align="center" valign="top">-</td>
<td align="center" valign="top">12.6* (2.9)</td>
<td align="center" valign="top">14.3* (2.5)</td>
<td align="center" valign="top">40.1* (4.8)</td>
<td align="center" valign="top">-0.17* (0.31)</td>
<td align="center" valign="top">16.4* (3.7)</td>
<td align="center" valign="top">12.0 (0.5)</td>
<td align="center" valign="top">5.0* (0.4)</td>
<td align="center" valign="top">0.64 (0.05)</td>
</tr>
<tr>
<td align="left" valign="top">RLAM</td>
<td align="center" valign="top">0.05</td>
<td align="center" valign="top">4.0</td>
<td align="center" valign="top">13.5* (2.8)</td>
<td align="center" valign="top">14.8* (2.4)</td>
<td align="center" valign="top">39.8 (5.1)</td>
<td align="center" valign="top">0.08* (0.29)</td>
<td align="center" valign="top">21.0 (4.2)</td>
<td align="center" valign="top">12.7* (0.5)</td>
<td align="center" valign="top">4.8* (0.4)</td>
<td align="center" valign="top">0.62 (0.06)</td>
</tr>
<tr>
<td align="left" valign="top">RLAM</td>
<td align="center" valign="top">0.2</td>
<td align="center" valign="top">4.0</td>
<td align="center" valign="top">12.5* (2.9)</td>
<td align="center" valign="top">14.0* (2.5)</td>
<td align="center" valign="top">39.8 (5.0)</td>
<td align="center" valign="top">-0.01* (0.32)</td>
<td align="center" valign="top">17.7* (3.4)</td>
<td align="center" valign="top">12.4* (0.5)</td>
<td align="center" valign="top">4.9* (0.4)</td>
<td align="center" valign="top">0.63 (0.05)</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>We observe that reinforcement learning models trained with different rewards exhibit a notable reduction in reading level, bringing abstracts down to high school levels. The model directly guided by ARI (RLARI) achieves an ARI of 12.6, while the most performant model guided by accessibility measures (RLAM, <italic>&#x03B2;</italic><sub>SL</sub> = 0<italic>.</italic>2 and <italic>&#x03B2;</italic><sub>WA</sub> = 4<italic>.</italic>0) reaches 12.5, both aligning with the readability level expected for individuals who have completed K-12 education (approximately ARI 13). However, RLARI and RLAM models achieve these readability improvements in different ways. For RLAM models, better readability is achieved mostly through improved token-level accessibility. RLAM models show an increase in word accessibility from 0.5 to 0.8 compared to the supervised baseline. This increase in the natural logarithm suggests that words generated by RLAM models are, on average, 1.6 to 2.2 times more frequent in the English Wikipedia corpus than those generated by the SFT model. In comparison, RLARI&#x2019;s 0.1-unit increase in word accessibility, although observed, does not result in a statistically significant change in word frequency compared to the SFT model. Similarly, the log ratio of the VOA1500 vocabulary in the RLAM models shows a significant improvement, with log ratios ranging from <italic>-</italic>0.02 to 0.08. This implies that for every 100 non-VOA1500 (or more complex) words generated, RLAM models can produce approximately 99 to 106 VOA1500 basic words. In contrast, the SFT and RLARI models exhibit VOA log ratios of <italic>-</italic>0.26 and <italic>-</italic>0.17, respectively, indicating that for every 100 non-VOA1500 words generated, these models produce only around 77 (84) VOA1500 words. The average word length in characters for RLAM models ranges from 4.8 to 4.9, slightly shorter than RLARI&#x2019;s and outperforming SFT. Overall, the above evidence suggests that RLAM models achieve better readability by using more common, simpler, and shorter words.</p>
<p>On the other hand, the RLARI model achieves better readability by producing much shorter sentences, with only a marginal boost in word-level accessibility. The RLARI model has the shortest average sentence length of 16.4 words, significantly outperforming the SFT model. In comparison, a significantly shorter average sentence length is only observed when RLAM&#x2019;s sentence length reward coefficient (<italic>&#x03B2;</italic><sub>SL</sub>) exceeds 0.08. We also observed that, in pilot studies, if <italic>&#x03B2;</italic><sub>SL</sub> is set to a higher value, such as 0.5, the model&#x2019;s optimisation will collapse after only a few hundred steps, similar to what we consistently observed in the optimisation of RLARI models. The improvement in the RLARI model&#x2019;s word-level accessibility is inconsistent: while we observed significant gains in the VOA log ratio and word length, the word accessibility measure did not show statistically significant improvement after a Bonferroni correction. That said, although RLARI uses more VOA basic words and shorter words, their frequency in the English Wikipedia corpus is not high enough to result in a significant increase in word accessibility compared to the SFT baseline.</p>
<p>For semantic coherence with the human-written significance statements, the BERT scores from the reinforcement learning models are not significantly worse than those of the SFT model. This finding suggests that the generations from the reinforcement learning models are likely to remain semantically faithful and that the reinforcement learning process does not significantly degrade language quality, at least up to the point before unstable optimisation signals were observed. SARI scores from the reinforcement learning models are significantly, yet only marginally, better than those of the SFT baseline. Since SARI is a measurement of a combination of deletions, additions, and retention operations, a straightforward explanation is not available. However, the higher SARI scores confirm the greater simplification quality of the reinforcement learning models, which is likely due to a combination of simpler words and shorter sentences.</p>
</sec>
<sec id="sec4_2">
<title>Qualitative analysis</title>
<p>We annotated 5% of the generated simplified abstracts from reinforcement learning models guided by ARI (RLARI) and accessibility measures (RLAM, with <italic>&#x03B2;</italic><sub>WA</sub> = 4<italic>.</italic>0 and varying <italic>&#x03B2;</italic><sub>SL</sub> values) with respect to language quality, faithfulness, and completeness, as shown in <xref ref-type="fig" rid="F2">Figure 2</xref>. The test abstracts included three from biological sciences and one each from chemistry; mathematics; evolutionary biology; environmental sciences; ecology; economic sciences; and earth, atmospheric, and planetary sciences.</p>
<fig id="F2">
<label>Figure 2.</label>
<caption><p>We annotated 5% of the generated outputs from reinforcement learning models guided by ARI (RLARI) and accessibility measures (RLAM, with <italic>&#x03B2;</italic><sub>WA</sub> = 4<italic>.</italic>0 and two different <italic>&#x03B2;</italic><sub>SL</sub> values).</p></caption>
<graphic xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="images/c18-fig2.jpg"><alt-text>none</alt-text></graphic>
</fig>
<p>Regarding overall quality, we found that reinforcement learning-trained models generally produced high-quality language. Compared to the SFT generations, RLAM outputs are often shorter and more semantically complete, due to the imposed token budget for newly generated content (241 tokens, the length of the longest significance statement in the training set). The main shortcoming is the presence of small trailing phrases that often deflate readability scores, such as &#x2018;(PsycINFO Database Record),&#x2019; &#x2018;(show more),&#x2019; and &#x2018;All rights reserved.&#x2019; The first artefact is also found in the SFT model generations and is hypothesised to be carried over from the corpora that Gemma-2B was previously exposed to, as this pattern does not appear in the SASS corpus. An informal review of the remaining generations suggests that this phenomenon is amplified, appearing even in samples without such trailers in the SFT generations. The latter two artefacts were newly observed and are caused by the reinforcement learning processes. Where any of these issues appear, we annotate them as &#x2018;Acceptable,&#x2019; even though the generations are otherwise fluent and grammatical. We also found that the proportion of these artefacts steadily increases as training continues, and they are usually found in only a subset of samples across different experiments.</p>
<p>In assessments of faithfulness, we did not find any models hallucinating in the generations we examined. However, in informal examinations, we did find that reinforcement learning checkpoints may hallucinate by generating simple but overly hedging or short expressions when over-optimised. Generations from RLARI-trained models often remain unfinished, but they retain the main gist of the abstract. Although we do not observe trailer phrases in the RLARI-generated texts like those found in the RLAM models, this issue frequently arises in other RLARI runs. Subsequent checkpoints often exhibit similar problems and tend to deteriorate rapidly once the model starts cutting corners. This typically occurs shortly before the training process completely fails. However, the reported checkpoint happens to miss this characteristic.</p>
</sec>
</sec>
<sec id="sec5">
<title>Conclusion</title>
<p>To improve the accessibility of scientific literature to the general public, we implemented reinforcement learning techniques to guide language models, extending beyond the traditional cross-entropy objective. Our study demonstrates that carefully balancing accessibility measures at the word and sentence levels can effectively guide Gemma-2B in simplifying scholarly abstracts, outperforming the supervised fine-tuning baseline by a large margin. This approach achieves these improvements without compromising language quality or faithfulness and mitigates the supervised fine-tuning model&#x2019;s tendency to overemphasise research implications. The best model trained using our method successfully adjusts the readability level of scholarly abstracts by approximately six US grade levels&#x2014;in other words, from a postgraduate to a high school level. Compared to the supervised fine-tuning model, the words generated by the model trained via our approach are proven to be more common (1.6 to 2.2 times more frequent), easier (with more VOA basic words), and shorter (by 0.3 to 0.4 characters). This improvement addresses a key limitation of existing corpora, in which the target distribution (i.e., significance statements) often does not adequately prioritise the accessibility of word choice. We hope this work contributes to bridging the gap between scholarship and a broader audience, advancing the understanding and development of better simplification systems, and ultimately fostering a more informed and engaged society.</p>
</sec>
</body>
<back>
<ack>
<title>Acknowledgements</title>
<p>We gratefully acknowledge the support of the Institute of Museum and Library Services (No. RE-246450-OLS-20) and the National Social Science Fund of China (No. 23&#x0026;ZD221). We also thank the organisers of the LIS Education and Data Science Integrated Network Group (LEADING), including Jane Greenberg, Erik Mitchell, Kenning Arlitsch, Jonathan Wheeler, and Samantha Grabus. Additionally, we are thankful to Coltran Hophan-Nichols and Alexander Salois from the University Information Technology Research Cyberinfrastructure at Montana State University for providing computational resources on the Tempest High Performance Computing System, Doralyn Rossmann for research support, and Deanna Zarrillo for early involvement in the project.</p>
</ack>
<ref-list>
<title>References</title>
<ref id="R1"><element-citation publication-type="other"><person-group person-group-type="author"><name><surname>Berenbaum</surname><given-names>M. R.</given-names></name></person-group> <year>(2021)</year> <article-title>On COVID-19, cognitive bias, and open access</article-title></element-citation></ref>
<ref id="R2"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Calleja</surname><given-names>N.</given-names></name><name><surname>AbdAllah</surname><given-names>A.</given-names></name><name><surname>Abad</surname><given-names>N.</given-names></name><name><surname>Ahmed</surname><given-names>N.</given-names></name><name><surname>Albarracin</surname><given-names>D.</given-names></name><name><surname>Altieri</surname><given-names>E.</given-names></name><name><surname>Anoko</surname><given-names>J. N.</given-names></name><name><surname>Arcos</surname><given-names>R.</given-names></name><name><surname>Azlan</surname><given-names>A. A.</given-names></name><name><surname>Bayer</surname><given-names>J.</given-names></name><name><surname>Bechmann</surname><given-names>A.</given-names></name><name><surname>Bezbaruah</surname><given-names>S.</given-names></name><name><surname>Briand</surname><given-names>S. C.</given-names></name><name><surname>Brooks</surname><given-names>I.</given-names></name><name><surname>Bucci</surname><given-names>L. M.</given-names></name><name><surname>Burzo</surname><given-names>S.</given-names></name><name><surname>Czerniak</surname><given-names>C.</given-names></name><name><surname>De Domenico</surname><given-names>M.</given-names></name><name><surname>Dunn</surname><given-names>A. G.</given-names></name><name><surname>Ecker</surname><given-names>U. K. H.</given-names></name><name><surname>Espinosa</surname><given-names>L.</given-names></name><name><surname>Francois</surname><given-names>C.</given-names></name><name><surname>Gradon</surname><given-names>K.</given-names></name><name><surname>Gruzd</surname><given-names>A.</given-names></name><name><surname>G&#x00FC;lg&#x00FC;n</surname><given-names>B. 
S.</given-names></name><name><surname>Haydarov</surname><given-names>R.</given-names></name><name><surname>Hurley</surname><given-names>C.</given-names></name><name><surname>Astuti</surname><given-names>S. I.</given-names></name><name><surname>Ishizumi</surname><given-names>A.</given-names></name><name><surname>Johnson</surname><given-names>N.</given-names></name><name><surname>Johnson Restrepo</surname><given-names>D.</given-names></name><name><surname>Kajimoto</surname><given-names>M.</given-names></name><name><surname>Koyuncu</surname><given-names>A.</given-names></name><name><surname>Kulkarni</surname><given-names>S.</given-names></name><name><surname>Lamichhane</surname><given-names>J.</given-names></name><name><surname>Lewis</surname><given-names>R.</given-names></name><name><surname>Mahajan</surname><given-names>A.</given-names></name><name><surname>Mandil</surname><given-names>A.</given-names></name><name><surname>McAweeney</surname><given-names>E.</given-names></name><name><surname>Messer</surname><given-names>M.</given-names></name><name><surname>Moy</surname><given-names>W.</given-names></name><name><surname>Ndumbi Ngamala</surname><given-names>P.</given-names></name><name><surname>Nguyen</surname><given-names>T.</given-names></name><name><surname>Nunn</surname><given-names>M.</given-names></name><name><surname>Omer</surname><given-names>S. 
B.</given-names></name><name><surname>Pagliari</surname><given-names>C.</given-names></name><name><surname>Patel</surname><given-names>P.</given-names></name><name><surname>Phuong</surname><given-names>L.</given-names></name><name><surname>Prybylski</surname><given-names>D.</given-names></name><name><surname>Rashidian</surname><given-names>A.</given-names></name><name><surname>Rempel</surname><given-names>E.</given-names></name><name><surname>Rubinelli</surname><given-names>S.</given-names></name><name><surname>Sacco</surname><given-names>P.</given-names></name><name><surname>Schneider</surname><given-names>A.</given-names></name><name><surname>Shu</surname><given-names>K.</given-names></name><name><surname>Smith</surname><given-names>M.</given-names></name><name><surname>Sufehmi</surname><given-names>H.</given-names></name><name><surname>Tangcharoensathien</surname><given-names>V.</given-names></name><name><surname>Terry</surname><given-names>R.</given-names></name><name><surname>Thacker</surname><given-names>N.</given-names></name><name><surname>Trewinnard</surname><given-names>T.</given-names></name><name><surname>Turner</surname><given-names>S.</given-names></name><name><surname>Tworek</surname><given-names>H.</given-names></name><name><surname>Uakkas</surname><given-names>S.</given-names></name><name><surname>Vraga</surname><given-names>E.</given-names></name><name><surname>Wardle</surname><given-names>C.</given-names></name><name><surname>Wasserman</surname><given-names>H.</given-names></name><name><surname>Wilhelm</surname><given-names>E.</given-names></name><name><surname>W&#x00FC;rz</surname><given-names>A.</given-names></name><name><surname>Yau</surname><given-names>B.</given-names></name><name><surname>Zhou</surname><given-names>L.</given-names></name><name><surname>Purnat</surname><given-names>T. 
D.</given-names></name></person-group> <year>(2021)</year> <article-title>A public health research agenda for managing infodemics: Methods and results of the first who infodemiology conference</article-title><source>JMIR Infodemiology</source><volume>1</volume><issue>1</issue><fpage>e30979</fpage></element-citation></ref>
<ref id="R3"><element-citation publication-type="book"><person-group person-group-type="author"><name><surname>DuBay</surname><given-names>W. H.</given-names></name></person-group> <year>(2004)</year> <chapter-title>The principles of readability</chapter-title><source>Technical report, Impact Information</source><publisher-loc>Costa Mesa, CA</publisher-loc></element-citation></ref>
<ref id="R4"><element-citation publication-type="book"><person-group person-group-type="author"><name><surname>Flesch</surname><given-names>R.</given-names></name></person-group> <year>(1946)</year> <source>The Art of Plain Talk</source><publisher-name>Harper &#x0026; Row</publisher-name><publisher-loc>New York</publisher-loc><comment>first edition</comment></element-citation></ref>
<ref id="R5"><element-citation publication-type="other"><person-group person-group-type="author"><name><surname>Goldsack</surname><given-names>T.</given-names></name><name><surname>Zhang</surname><given-names>Z.</given-names></name><name><surname>Lin</surname><given-names>C.</given-names></name><name><surname>Scarton</surname><given-names>C.</given-names></name></person-group> <year>(2022)</year> <article-title>Making science simple: Corpora for the lay summarisation of scientific literature</article-title><source>arXiv preprint arXiv:2210.09932</source></element-citation></ref>
<ref id="R6"><element-citation publication-type="other"><person-group person-group-type="author"><name><surname>Huang</surname><given-names>S.</given-names></name><name><surname>Noukhovitch</surname><given-names>M.</given-names></name><name><surname>Hosseini</surname><given-names>A.</given-names></name><name><surname>Rasul</surname><given-names>K.</given-names></name><name><surname>Wang</surname><given-names>W.</given-names></name><name><surname>Tunstall</surname><given-names>L.</given-names></name></person-group> <year>(2024)</year> <article-title>The N+ implementation details of RLHF with PPO: A case study on TL;DR summarization</article-title><comment>arXiv preprint arXiv:2403.17031</comment></element-citation></ref>
<ref id="R7"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Islam</surname><given-names>A. N.</given-names></name><name><surname>Laato</surname><given-names>S.</given-names></name><name><surname>Talukder</surname><given-names>S.</given-names></name><name><surname>Sutinen</surname><given-names>E.</given-names></name></person-group> <year>(2020)</year> <article-title>Misinformation sharing and social media fatigue during COVID-19: An affordance and cognitive load perspective</article-title><source>Technological forecasting and social change</source><volume>159</volume><fpage>120201</fpage></element-citation></ref>
<ref id="R8"><element-citation publication-type="other"><person-group person-group-type="author"><name><surname>Joseph</surname><given-names>S.</given-names></name><name><surname>Kazanas</surname><given-names>K.</given-names></name><name><surname>Reina</surname><given-names>K.</given-names></name><name><surname>Ramanathan</surname><given-names>V. J.</given-names></name><name><surname>Xu</surname><given-names>W.</given-names></name><name><surname>Wallace</surname><given-names>B. C.</given-names></name><name><surname>Li</surname><given-names>J. J.</given-names></name></person-group> <year>(2023)</year> <article-title>Multilingual simplification of medical texts</article-title><source>arXiv preprint arXiv:2305.12532</source></element-citation></ref>
<ref id="R9"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kerwer</surname><given-names>M.</given-names></name><name><surname>Chasiotis</surname><given-names>A.</given-names></name><name><surname>Stricker</surname><given-names>J.</given-names></name><name><surname>G&#x00FC;nther</surname><given-names>A.</given-names></name><name><surname>Rosman</surname><given-names>T.</given-names></name></person-group> <year>(2021)</year> <article-title>Straight From the Scientist&#x2019;s Mouth&#x2014;Plain Language Summaries Promote Laypeople&#x2019;s Comprehension and Knowledge Acquisition When Reading About Individual Research Findings in Psychology</article-title><source>Collabra: Psychology</source><volume>7</volume><issue>1</issue><fpage>18898</fpage></element-citation></ref>
<ref id="R10"><element-citation publication-type="book"><person-group person-group-type="author"><name><surname>Kingma</surname><given-names>D. P.</given-names></name><name><surname>Ba</surname><given-names>J.</given-names></name></person-group> <year>(2015)</year> <article-title>Adam: A method for stochastic optimization</article-title><person-group person-group-type="editor"><name><surname>Bengio</surname><given-names>Y.</given-names></name><name><surname>LeCun</surname><given-names>Y.</given-names></name></person-group><source>3rd International Conference on Learning Representations, ICLR 2015</source><publisher-loc>San Diego, CA, USA</publisher-loc><comment>May 7-9, 2015, Conference Track Proceedings</comment></element-citation></ref>
<ref id="R11"><element-citation publication-type="other"><person-group person-group-type="author"><name><surname>Loshchilov</surname><given-names>I.</given-names></name><name><surname>Hutter</surname><given-names>F.</given-names></name></person-group> <year>(2017)</year> <article-title>Decoupled weight decay regularization</article-title><source>In International Conference on Learning Representations</source></element-citation></ref>
<ref id="R12"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Mazur</surname><given-names>B.</given-names></name></person-group> <year>(2000)</year> <article-title>Revisiting plain language</article-title><source>Technical communication</source><volume>47</volume><issue>2</issue><fpage>205</fpage><lpage>205</lpage></element-citation></ref>
<ref id="R13"><element-citation publication-type="other"><person-group person-group-type="author"><name><surname>Mesnard</surname><given-names>T.</given-names></name><name><surname>Hardin</surname><given-names>C.</given-names></name><name><surname>Dadashi</surname><given-names>R.</given-names></name><name><surname>Bhupatiraju</surname><given-names>S.</given-names></name><name><surname>Pathak</surname><given-names>S.</given-names></name><name><surname>Sifre</surname><given-names>L.</given-names></name><name><surname>Rivi&#x00E8;re</surname><given-names>M.</given-names></name><name><surname>Kale</surname><given-names>M. S.</given-names></name><name><surname>Love</surname><given-names>J.</given-names></name></person-group><etal/> <year>(2024)</year> <article-title>Gemma: Open models based on gemini research and technology</article-title><source>arXiv preprint arXiv:2403.08295</source></element-citation></ref>
<ref id="R14"><element-citation publication-type="book"><person-group person-group-type="author"><name><surname>Papineni</surname><given-names>K.</given-names></name><name><surname>Roukos</surname><given-names>S.</given-names></name><name><surname>Ward</surname><given-names>T.</given-names></name><name><surname>Zhu</surname><given-names>W.-J.</given-names></name></person-group> <year>(2002)</year> <article-title>Bleu: a method for automatic evaluation of machine translation</article-title><person-group person-group-type="editor"><name><surname>Isabelle</surname><given-names>P.</given-names></name><name><surname>Charniak</surname><given-names>E.</given-names></name><name><surname>Lin</surname><given-names>D.</given-names></name></person-group><source>Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics</source><fpage>311</fpage><lpage>318</lpage><publisher-loc>Philadelphia, Pennsylvania, USA</publisher-loc><publisher-name>Association for Computational Linguistics</publisher-name></element-citation></ref>
<ref id="R15"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Petelin</surname><given-names>R.</given-names></name></person-group> <year>(2010)</year> <article-title>Considering plain language: Issues and initiatives</article-title><source>Corporate Communications: An International Journal</source><volume>15</volume><issue>2</issue><fpage>205</fpage><lpage>216</lpage></element-citation></ref>
<ref id="R16"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Plav&#x00E9;n-Sigray</surname><given-names>P.</given-names></name><name><surname>Matheson</surname><given-names>G. J.</given-names></name><name><surname>Schiffler</surname><given-names>B. C.</given-names></name><name><surname>Thompson</surname><given-names>W. H.</given-names></name></person-group> <year>(2017)</year> <article-title>Research: The readability of scientific texts is decreasing over time</article-title><source>eLife</source><volume>6</volume><fpage>e27725</fpage></element-citation></ref>
<ref id="R17"><element-citation publication-type="book"><person-group person-group-type="author"><name><surname>Pool</surname><given-names>J.</given-names></name><name><surname>Fatehi</surname><given-names>F.</given-names></name><name><surname>Akhlaghpour</surname><given-names>S.</given-names></name></person-group> <year>(2021)</year> <article-title>Infodemic, misinformation and disinformation in pandemics: Scientific landscape and the road ahead for public health informatics research</article-title><source>In Public Health and Informatics</source><fpage>764</fpage><lpage>768</lpage><publisher-name>IOS Press</publisher-name></element-citation></ref>
<ref id="R18"><element-citation publication-type="other"><person-group person-group-type="author"><name><surname>Ramamurthy</surname><given-names>R.</given-names></name><name><surname>Ammanabrolu</surname><given-names>P.</given-names></name><name><surname>Brantley</surname><given-names>K.</given-names></name><name><surname>Hessel</surname><given-names>J.</given-names></name><name><surname>Sifa</surname><given-names>R.</given-names></name><name><surname>Bauckhage</surname><given-names>C.</given-names></name><name><surname>Hajishirzi</surname><given-names>H.</given-names></name><name><surname>Choi</surname><given-names>Y.</given-names></name></person-group> <year>(2023)</year> <article-title>Is reinforcement learning (not) for natural language processing: Benchmarks, baselines, and building blocks for natural language policy optimization</article-title><source>In The Eleventh International Conference on Learning Representations</source></element-citation></ref>
<ref id="R19"><element-citation publication-type="other"><person-group person-group-type="author"><name><surname>Riddell</surname><given-names>A.</given-names></name><name><surname>Igarashi</surname><given-names>Y.</given-names></name></person-group> <year>(2021)</year> <article-title>Varieties of plain language</article-title><person-group person-group-type="editor"><name><surname>Mitkov</surname><given-names>R.</given-names></name><name><surname>Angelova</surname><given-names>G.</given-names></name></person-group><source>Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2021)</source><fpage>1180</fpage><lpage>1187</lpage></element-citation></ref>
<ref id="R20"><element-citation publication-type="other"><person-group person-group-type="author"><name><surname>Rothwell</surname><given-names>J.</given-names></name></person-group> <year>(2020)</year> <article-title>Assessing the economic gains of eradicating illiteracy nationally and regionally in the United States</article-title><source>Technical report, Barbara Bush Foundation for Family Literacy and Gallup</source><comment>Accessed 2024-07-12</comment></element-citation></ref>
<ref id="R21"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Schriver</surname><given-names>K. A.</given-names></name></person-group> <year>(2017)</year> <article-title>Plain language in the US gains momentum: 1940&#x2013;2015</article-title><source>IEEE Transactions on Professional Communication</source><volume>60</volume><issue>4</issue><fpage>343</fpage><lpage>383</lpage></element-citation></ref>
<ref id="R22"><element-citation publication-type="other"><person-group person-group-type="author"><name><surname>Schulman</surname><given-names>J.</given-names></name><name><surname>Wolski</surname><given-names>F.</given-names></name><name><surname>Dhariwal</surname><given-names>P.</given-names></name><name><surname>Radford</surname><given-names>A.</given-names></name><name><surname>Klimov</surname><given-names>O.</given-names></name></person-group> <year>(2017)</year> <article-title>Proximal policy optimization algorithms</article-title><source>arXiv preprint arXiv:1707.06347</source></element-citation></ref>
<ref id="R23"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Stiennon</surname><given-names>N.</given-names></name><name><surname>Ouyang</surname><given-names>L.</given-names></name><name><surname>Wu</surname><given-names>J.</given-names></name><name><surname>Ziegler</surname><given-names>D.</given-names></name><name><surname>Lowe</surname><given-names>R.</given-names></name><name><surname>Voss</surname><given-names>C.</given-names></name><name><surname>Radford</surname><given-names>A.</given-names></name><name><surname>Amodei</surname><given-names>D.</given-names></name><name><surname>Christiano</surname><given-names>P. F.</given-names></name></person-group> <year>(2020)</year> <article-title>Learning to summarize with human feedback</article-title><source>Advances in Neural Information Processing Systems</source><volume>33</volume><fpage>3008</fpage><lpage>3021</lpage></element-citation></ref>
<ref id="R24"><element-citation publication-type="book"><person-group person-group-type="author"><name><surname>Wang</surname><given-names>H.</given-names></name><name><surname>Clark</surname><given-names>J.</given-names></name></person-group> <year>(2024)</year> <article-title>Simplifying scholarly abstracts for accessible digital libraries using language models</article-title><source>2024 ACM/IEEE Joint Conference on Digital Libraries, JCDL &#x2019;24</source><comment>page 8</comment><publisher-loc>Hong Kong, China</publisher-loc><publisher-name>ACM</publisher-name></element-citation></ref>
<ref id="R25"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname><given-names>Y.</given-names></name><name><surname>McKee</surname><given-names>M.</given-names></name><name><surname>Torbica</surname><given-names>A.</given-names></name><name><surname>Stuckler</surname><given-names>D.</given-names></name></person-group> <year>(2019)</year> <article-title>Systematic literature review on the spread of health-related misinformation on social media</article-title><source>Social science &#x0026; medicine</source><volume>240</volume><fpage>112552</fpage></element-citation></ref>
<ref id="R26"><element-citation publication-type="other"><person-group person-group-type="author"><collab>Wikimedia Foundation</collab></person-group> <year>(2024)</year> <article-title>VOA Special English Word Book</article-title><comment>[Online; accessed 9 January 2024]</comment></element-citation></ref>
<ref id="R27"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Xu</surname><given-names>W.</given-names></name><name><surname>Callison-Burch</surname><given-names>C.</given-names></name><name><surname>Napoles</surname><given-names>C.</given-names></name></person-group> <year>(2015)</year> <article-title>Problems in current text simplification research: new data can help</article-title><source>Transactions of the Association for Computational Linguistics</source><volume>3</volume><fpage>283</fpage><lpage>297</lpage></element-citation></ref>
<ref id="R28"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Xu</surname><given-names>W.</given-names></name><name><surname>Napoles</surname><given-names>C.</given-names></name><name><surname>Pavlick</surname><given-names>E.</given-names></name><name><surname>Chen</surname><given-names>Q.</given-names></name><name><surname>Callison-Burch</surname><given-names>C.</given-names></name></person-group> <year>(2016)</year> <article-title>Optimizing statistical machine translation for text simplification</article-title><source>Transactions of the Association for Computational Linguistics</source><volume>4</volume><fpage>401</fpage><lpage>415</lpage></element-citation></ref>
<ref id="R29"><element-citation publication-type="other"><person-group person-group-type="author"><name><surname>Zhang</surname><given-names>T.</given-names></name><name><surname>Kishore</surname><given-names>V.</given-names></name><name><surname>Wu</surname><given-names>F.</given-names></name><name><surname>Weinberger</surname><given-names>K. Q.</given-names></name><name><surname>Artzi</surname><given-names>Y.</given-names></name></person-group> <year>(2019)</year> <article-title>BERTScore: Evaluating text generation with BERT</article-title><source>arXiv preprint arXiv:1904.09675</source></element-citation></ref>
<ref id="R30"><element-citation publication-type="other"><person-group person-group-type="author"><name><surname>Ziegler</surname><given-names>D. M.</given-names></name><name><surname>Stiennon</surname><given-names>N.</given-names></name><name><surname>Wu</surname><given-names>J.</given-names></name><name><surname>Brown</surname><given-names>T. B.</given-names></name><name><surname>Radford</surname><given-names>A.</given-names></name><name><surname>Amodei</surname><given-names>D.</given-names></name><name><surname>Christiano</surname><given-names>P.</given-names></name><name><surname>Irving</surname><given-names>G.</given-names></name></person-group> <year>(2019)</year> <article-title>Fine-tuning language models from human preferences</article-title><source>arXiv preprint arXiv:1909.08593</source></element-citation></ref>
</ref-list>
</back>
</article>