Veri seti Açık Erişim

TWiST: Turkish-English Wikipedia & Thesis STEM Terminology Dataset

Gebeşçe, Ali; Gül Şahin, Gözde; Amasya, Ege Uğur


DataCite XML

<?xml version='1.0' encoding='utf-8'?>
<resource xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://datacite.org/schema/kernel-4" xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4.1/metadata.xsd">
  <!-- Primary identifier of this record, given as a DOI. -->
  <identifier identifierType="DOI">10.48623/aperta.286016</identifier>
  <!-- Dataset creators. Each entry gives the name in "Family, Given" form plus
       its split parts and an affiliation. -->
  <creators>
    <creator>
      <creatorName>Gebeşçe, Ali</creatorName>
      <givenName>Ali</givenName>
      <familyName>Gebeşçe</familyName>
      <nameIdentifier
          nameIdentifierScheme="ORCID"
          schemeURI="http://orcid.org/">0000-0002-7997-0557</nameIdentifier>
      <affiliation>Koç Üniversitesi</affiliation>
    </creator>
    <creator>
      <creatorName>Gül Şahin, Gözde</creatorName>
      <givenName>Gözde</givenName>
      <familyName>Gül Şahin</familyName>
      <nameIdentifier
          nameIdentifierScheme="ORCID"
          schemeURI="http://orcid.org/">0000-0002-0332-1657</nameIdentifier>
      <affiliation>Koç Üniversitesi</affiliation>
    </creator>
    <!-- This creator is recorded without a nameIdentifier. -->
    <creator>
      <creatorName>Amasya, Ege Uğur</creatorName>
      <givenName>Ege Uğur</givenName>
      <familyName>Amasya</familyName>
      <affiliation>Koç Üniversitesi</affiliation>
    </creator>
  </creators>
  <titles>
    <!-- Main title. "TWiST" and "STEM" are acronyms and keep the casing used
         in the abstract; automatic title-casing must not be applied here. -->
    <title>TWiST: Turkish-English Wikipedia &amp; Thesis STEM Terminology Dataset</title>
  </titles>
  <!-- Publishing entity and year of publication. -->
  <publisher>Aperta</publisher>
  <publicationYear>2025</publicationYear>
  <dates>
    <!-- Issue date; its year agrees with publicationYear. -->
    <date dateType="Issued">2025-06-25</date>
  </dates>
  <!-- Language code of the resource. -->
  <language>en</language>
  <!-- Only the general type is given; the element has no free-text content. -->
  <resourceType resourceTypeGeneral="Dataset"/>
  <alternateIdentifiers>
    <!-- Record URL; the trailing record number (286016) matches the DOI suffix. -->
    <alternateIdentifier alternateIdentifierType="url">https://aperta.ulakbim.gov.tr/record/286016</alternateIdentifier>
  </alternateIdentifiers>
  <relatedIdentifiers>
    <!-- Declares this record a version of another DOI.
         NOTE(review): presumably the version-independent DOI for the dataset; confirm. -->
    <relatedIdentifier relatedIdentifierType="DOI" relationType="IsVersionOf">10.48623/aperta.286015</relatedIdentifier>
  </relatedIdentifiers>
  <!-- Free-text version label. -->
  <version>version_1</version>
  <rightsList>
    <!-- Licence: the URI points at CC BY-NC 4.0; the label text omits the version number. -->
    <rights rightsURI="https://creativecommons.org/licenses/by-nc/4.0/">Creative Commons Attribution-NonCommercial</rights>
    <!-- Access level, expressed as an info:eu-repo/semantics URI. -->
    <rights rightsURI="info:eu-repo/semantics/openAccess">Open Access</rights>
  </rightsList>
  <descriptions>
    <!-- Abstract stored as an HTML fragment whose markup is escaped for XML
         (e.g. &lt;p&gt;, &lt;em&gt;, &amp;mdash;, &amp;amp;). After XML parsing the value is
         HTML source, not plain text, and needs a second HTML decoding step.
         NOTE(review): plain-text consumers will show raw tags and entity names;
         confirm downstream tools expect HTML here. -->
    <description descriptionType="Abstract">&lt;p&gt;We introduce &lt;em&gt;&lt;strong&gt;TWiST&lt;/strong&gt;&lt;/em&gt;&amp;mdash;the Turkish-English Wikipedia &amp;amp; Thesis STEM Terminology Dataset&amp;mdash;an expertly curated, sentence-aligned bilingual resource that addresses a key gap in Turkish computational linguistics. &lt;em&gt;TWiST&lt;/em&gt; is a 3,300-sentence Turkish&amp;ndash;English parallel corpus of STEM terminology drawn from two sources: 1,185 sentences from Wikimedia Content Translation dump and 2,115 sentences from 287 graduate-thesis abstracts at Turkey&amp;rsquo;s top six universities. Focused on Mathematics, Physics, and Computer Science, every sentence pair was triple-annotated by 43 trained bilingual annotators following a 30-page guideline, achieving substantial agreement (Fleiss &amp;kappa; &amp;asymp; 0.7). &lt;em&gt;TWiST&lt;/em&gt; ultimately captures 10,157 annotated term instances covering 1,223 distinct English technical terms, offering a high-quality benchmark for bilingual terminology extraction, translation consistency, and terminology-aware NLP.&lt;/p&gt;</description>
  </descriptions>
</resource>
0
0
görüntülenme
indirilme
Tüm sürümler Bu sürüm
Görüntülenme 0 0
İndirme 0 0
Veri hacmi 0 Bytes 0 Bytes
Tekil görüntülenme 0 0
Tekil indirme 0 0

Alıntı yap