Jump to content

Content translation/analytics/queries

Add topic
From mediawiki.org

Pages created during the month in all languages (from the central DB)

[edit]
select count(*) as article_count
from cx_translations
where
    (translation_status = 'published' or translation_target_url is not null) AND
    translation_last_updated_timestamp between 20151000000000 and 20151099999999
order by article_count;

Pages created till the end of a month in all languages (from the central DB)

[edit]
select count(*) as article_count
from cx_translations
where
    (translation_status = 'published' or translation_target_url is not null) AND
    translation_last_updated_timestamp < 20151100000000
order by article_count;

Number of new translators during a month

[edit]
select translation_started_by, count(*) as article_count
from cx_translations
where (translation_status = 'published' or translation_target_url is not null) and
    translation_start_timestamp < '20151100000000'
group by translation_started_by
order by article_count;

Number of translators who first published in a given month. The query gives result for end of a month. To get new translators in a given month, substract the number the query returns for the previous month.

Translator with the highest cumulative number of translations till the end of a month

[edit]
select
    translation_started_by,
    count(translation_target_title)
from
    `cx_translations`
where
    (translation_status = 'published' or translation_target_url is not null) and
    translation_last_updated_timestamp < 20151100000000
group by translation_started_by
order by count(translation_target_title) desc limit 2;

Language with the most translators

[edit]
select
    translation_target_language,
    count(distinct(translation_started_by)) as translator_count
from
    `cx_translations`
where
    ( translation_status = 'published' or translation_target_url is not null ) and
    translation_last_updated_timestamp < 20151100000000
group by translation_target_language
order by translator_count desc limit 1;

Users having X translations - all time

[edit]
select
    translation_started_by,
    count(translation_id) as article_count
from
    `cx_translations`
where
    (translation_status = 'published' or translation_target_url is not null)
group by translation_started_by having article_count = 1;

Users having a range of X translations

[edit]
select
    translation_started_by,
    count(translation_id) as article_count
from
    `cx_translations`
where
    (translation_status = 'published' or translation_target_url is not null)
group by translation_started_by having article_count between 5 and 20;

Users having X translations - in a range of dates

[edit]
select
    translation_started_by,
    count(translation_id) as article_count
from
    `cx_translations`
where
    (translation_status = 'published' or translation_target_url is not null) and
    translation_start_timestamp like '201511%'
group by translation_started_by having article_count = 1;

Users having a range of X translations - in a range of dates

[edit]
select
    translation_started_by,
    count(translation_id) as article_count
from
    `cx_translations`
where
    (translation_status = 'published' or translation_target_url is not null) and
    translation_start_timestamp like '201511%'
group by translation_started_by having article_count between 5 and 20;

Total number of translators in a range of dates

[edit]
SELECT
    translation_started_by,
    COUNT(translation_id) AS article_count
FROM
    `cx_translations`
WHERE
    (translation_status = 'published' OR translation_target_url IS NOT NULL) AND
    translation_last_updated_timestamp BETWEEN 20160100000000 AND 20160400000000
GROUP BY
    translation_started_by having article_count > 0;

Language with the most translations

[edit]
select
    translation_target_language,
    count(translation_target_title)
from
    `cx_translations`
where
    ( translation_status = 'published' or translation_target_url is not null ) and
    translation_last_updated_timestamp < 20151100000000
group by translation_target_language
order by count(translation_target_title) desc limit 1;

Pages created in the main namespace (from the wiki)

[edit]
select
    page_id,
    page_namespace,
    page_title,
    max(ct_rev_id),
    ct_params,
    user_name,
    rev_timestamp
from
    `change_tag`,
    `revision`,
    `user`,
    `page`
where
    ct_tag = 'contenttranslation' and
    page_namespace = 0 and
    rev_id = ct_rev_id and
    rev_page = page_id and
    rev_user = user_id
GROUP BY page_id, rev_timestamp
ORDER BY null;

General page creations per day

[edit]
select 
    substr(`timestamp`, 1, 8) as create_date, count(event_title)
from
    log.PageCreation_7481635
where
    webHost = 'ca.wikipedia.org'
        and `timestamp` between 20140701000000 and 20140725000000
        and event_namespace = 0
group by create_date
order by create_date;

Number of users who created pages

[edit]
select articles.user_name, count(articles.user_name) as excitement from
  ( select
      page_id,
      page_namespace,
      page_title,
      max(ct_rev_id),
      ct_params,
      user_name,
      rev_timestamp
    from
      `change_tag`,
      `revision`,
      `user`,
      `page`
    where
      page_namespace = 0 and
      ct_tag = 'contenttranslation' and
      rev_id = ct_rev_id and
      rev_page = page_id and
      rev_user = user_id
    GROUP BY page_id, rev_timestamp
    ORDER BY null
  ) as articles
group by articles.user_name
order by excitement;

Number of users who created more than one page

[edit]

Live example: http://quarry.wmflabs.org/query/3033

select articles.user_name, count(articles.user_name) as excitement from
  ( select
      page_id,
      page_namespace,
      page_title,
      max(ct_rev_id),
      ct_params,
      user_name,
      rev_timestamp
    from
      `change_tag`,
      `revision`,
      `user`,
      `page`
    where
      page_namespace = 0 and
      ct_tag = 'contenttranslation' and
      rev_id = ct_rev_id and
      rev_page = page_id and
      rev_user = user_id
    GROUP BY page_id, rev_timestamp
    ORDER BY null
  ) as articles
group by articles.user_name
having excitement > 1
order by excitement;

Deleted pages

[edit]
SELECT
    yearweek(ar_timestamp) month,
    count(ar_page_id) deletions
FROM
    `change_tag`,
    `archive` 
WHERE
    ar_namespace = '0' AND
    ct_tag = 'contenttranslation' AND
    ar_rev_id = ct_rev_id
GROUP BY
     yearweek(ar_timestamp);

Weekly deletion trend query: http://quarry.wmflabs.org/query/5023

The 5 most prolific translators to a language

[edit]
select
    translation_started_by,
    count(translation_target_title)
from
    `cx_translations`
where
    (translation_status = 'published' or translation_target_url is not null) and
    translation_target_language = 'he'
group by translation_started_by
order by count(translation_target_title) desc
limit 5;

The latest published translation by a given translator ID

[edit]
select
	translation_source_language,
	translation_target_language,
	translation_source_title,
	translation_target_title
from cx_translations
where
	(translation_status = 'published' or translation_target_url is not null) and
	translation_started_by = 1156
order by translation_last_updated_timestamp desc
limit 1;

Language with the most translations

[edit]
select
    translation_target_language,
    count(translation_target_title)
from
    `cx_translations`
where
    ( translation_status = 'published' or translation_target_url is not null ) and
    translation_last_updated_timestamp < 20150900000000
group by translation_target_language
order by count(translation_target_title) desc limit 1;

Number of translations in a language pair on a certain day

[edit]

All:

select
	count(*)
from
	cx_translations
where
	translation_source_language = 'en' and
	translation_target_language = 'ru' and
	translation_start_timestamp like '20151119%';

Published:

select
	count(*)
from
	cx_translations
where
	(translation_status = 'published' or translation_target_url is not null) AND
	translation_source_language = 'en' and
	translation_target_language = 'ru' and
	translation_start_timestamp like '20151119%';

Created pages (from the wiki)

[edit]
SELECT
    page_title
FROM
    `change_tag`,
    `revision`,
    `page` 
WHERE
    ct_tag = 'contenttranslation' AND
    rev_id = ct_rev_id AND
    page_id = rev_page
GROUP BY
    page_title;

Count pages published in a range of dates

[edit]
SELECT
    COUNT(*)
FROM
    cx_translations
WHERE
    ( translation_last_updated_timestamp BETWEEN 20170313000000 AND 20170319999999 ) AND
    ( translation_status = 'published' OR translation_target_url IS NOT NULL );

EventLogging

[edit]

Saving failures

[edit]
SELECT
	timestamp,
	userAgent,
	wiki,
	event_sourceLanguage,
	event_sourceTitle,
	event_targetLanguage,
	event_targetTitle,
	event_token,
	event_trace,
	event_version
FROM
	log.ContentTranslationError_11767097
WHERE
	event_context = 'save-failure' and
	timestamp like '20151221%'
GROUP BY
	event_session
ORDER BY
	timestamp;

Total

[edit]
SELECT
	timestamp,
	userAgent,
	wiki,
	event_sourceLanguage,
	event_sourceTitle,
	event_targetLanguage,
	event_targetTitle,
	event_token,
	event_trace,
	event_version
FROM
	log.ContentTranslationError_11767097
WHERE
	event_context = 'save-failure' and
	timestamp like '20151221%'
ORDER BY
	timestamp;

Restoring failures

[edit]
SELECT
	timestamp,
	userAgent,
	wiki,
	event_sourceLanguage,
	event_sourceTitle,
	event_targetLanguage,
	event_targetTitle,
	event_token,
	event_trace,
	event_version
FROM
	log.ContentTranslationError_11767097
WHERE
	event_context = 'restore-failure' and
	timestamp like '20151221%'
GROUP BY
	event_session
ORDER BY
	timestamp;

Publishing failures

[edit]
SELECT
	timestamp,
	userAgent,
	wiki,
	event_sourceLanguage,
	event_sourceTitle,
	event_targetLanguage,
	event_targetTitle,
	event_token,
	event_trace,
	event_version
FROM
	log.ContentTranslationError_11767097
WHERE
	event_context = 'publish-failure' and
	timestamp like '20151221%'
GROUP BY
	event_session
ORDER BY
	timestamp;

Suggestions

[edit]

Accepted suggestions

[edit]
SELECT
	event_targetLanguage,
	count(event_targetLanguage)
FROM
	log.ContentTranslationCTA_11616099 cta
WHERE
	cta.event_cta like 'suggestions%'
AND
	cta.event_action = 'accept'
GROUP BY
	event_targetLanguage;
Until a particular day
[edit]
SELECT
	event_targetLanguage,
	count(event_targetLanguage)
FROM
	log.ContentTranslationCTA_11616099 cta
WHERE
	cta.event_cta like 'suggestions%' AND
	cta.timestamp < 20160425000000 -- write the appropriate timestamp here
AND
	cta.event_action = 'accept'
GROUP BY
	event_targetLanguage;

Daily

[edit]
SELECT
       event_targetLanguage,
       count(event_targetLanguage)
FROM
       log.ContentTranslationCTA_11616099 cta
WHERE
       cta.event_cta = 'suggestions'
AND
       cta.event_action = 'accept'
GROUP BY
       event_targetLanguage;

Weekly

[edit]
SELECT
       event_targetLanguage, count(event_targetLanguage)
FROM
       log.ContentTranslationCTA_11616099 cta
WHERE
       cta.event_cta = 'suggestions'
AND
       cta.event_action = 'accept' AND timestamp > '20151119000000'
AND
       timestamp < '20151126000000'
GROUP BY
       event_targetLanguage;

Campaign

[edit]

Where CAMPAIGN value can be: newarticle, europeana2802016 etc

SELECT
       count(uuid)
FROM
       log.ContentTranslationCTA_11616099 cta
WHERE 
       timestamp > 20160201000000 
AND
       event_cta ='CAMPAIGN';